# stock_news_summaries_AI/dataset_creation/crawling_nasdaq_news.py
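"""Crawl MarketScreener news articles for NASDAQ 100 tickers.

Builds a per-ticker news-page URL table (nasdaq_url.tsv), downloads article
bodies into ./news/<ticker>/<date>/<title>.txt, and reports per-ticker news
counts and text-length statistics.
"""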
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
from tqdm import trange
import datetime
import os
import matplotlib.pyplot as plt
import numpy as np

import nasdaq100_crawling

nasdaq_url = "https://www.marketscreener.com/quote/index/NASDAQ-100-4946/components/col=7&asc=0&fpage={}"
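
# The urlopen + BeautifulSoup pair recurs in every crawler function below;
# a helper like this sketch could consolidate it. fetch_soup is an
# illustrative name and is not referenced by the existing code.
def fetch_soup(url):
    """Fetch a URL and return its parsed HTML (hypothetical helper)."""
    html = urllib.request.urlopen(url).read()
    return BeautifulSoup(html, 'html.parser')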
# Check each ticker's news-article listing page
def get_codezb():
    """Build ./dataset_creation/nasdaq_url.tsv mapping each NASDAQ 100 ticker to its MarketScreener news page."""
    # Run the "nasdaq100_crawling" module
    nasdaq_dic = pd.DataFrame(nasdaq100_crawling.get_nasdaq100())
    # Get the NASDAQ 100 list
    nasdaq100_codezb = pd.DataFrame(index=nasdaq_dic.ticker, columns=['url'])

    count = 0
    for page in trange(1, 4):
        print("page : ", page)
        url = nasdaq_url.format(page)
        # Fetch the HTML
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'html.parser')
        # 50 entries per page
        row = soup.select('table [class="tabBodyLV17"] tr')
        for idx in trange(1, len(row)):
            count += 1
            # URL of the company's page
            idx_url = "https://www.marketscreener.com" + row[idx].select('td')[1].find('a')['href']
            # Fetch the HTML
            idx_html = urllib.request.urlopen(idx_url).read()
            idx_soup = BeautifulSoup(idx_html, 'html.parser')
            ticker = idx_soup.select('span [class=fvTitleInfo]')[0].text
            nasdaq100_codezb.loc[ticker, 'url'] = idx_url + "news-quality/"

    nasdaq100_codezb.to_csv('./dataset_creation/nasdaq_url.tsv', index=True, sep='\t')
    print("{} / {}".format(count, len(nasdaq_dic.ticker)))
    from IPython import embed; embed()  # drop into an interactive shell for inspection
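
# The resulting nasdaq_url.tsv is a two-column, tab-separated table of
# ticker and news-page URL, e.g. (illustrative row):
#   AAPL    https://www.marketscreener.com/quote/stock/.../news-quality/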
# Check text lengths and the number of news articles
def get_textLength_and_newsCount():
    """Walk ./news, prune stale entries, and report per-ticker counts and average text lengths."""
    total_count = 0
    ranking = {}
    newsTextLengthList = []
    news_dir = './news'
    for ticker in os.listdir(news_dir):
        ticker_count = 0
        tickerTextLength_avg = 0.0
        dir2 = os.path.join(news_dir, ticker)
        if ticker == ".DS_Store":
            os.remove(dir2)  # delete the stray file
            continue
        for date in os.listdir(dir2):
            dir3 = os.path.join(dir2, date)
            if date == ".DS_Store":
                os.remove(dir3)  # delete the stray file
                continue
            if date < "2023.01.01":
                for title in os.listdir(dir3):
                    remove_dic = os.path.join(dir3, title)
                    os.remove(remove_dic)  # delete the file
                os.rmdir(dir3)  # delete the now-empty directory
                continue
            # print(date)
            for title in os.listdir(dir3):
                dir4 = os.path.join(dir3, title)
                with open(dir4, 'r') as f:
                    data = f.read()
                length = len(data)
                newsTextLengthList.append(length)
                tickerTextLength_avg += length
                ticker_count += 1
                total_count += 1
        avg = 0.0
        if ticker_count != 0:
            avg = tickerTextLength_avg / ticker_count
        ranking[ticker] = [ticker_count, avg]

    print("Ranking\t| Ticker\t| # of news\t| Average of News Text Length")
    sorted_pairs = sorted(ranking.items(), key=lambda x: -x[1][0])
    tickers, values = [], []
    for i, (ticker, element) in enumerate(sorted_pairs, start=1):
        tickers.append(ticker)
        values.append(element[0])
        print("{}\t{}\t{}\t{:.2f}".format(i, ticker, element[0], element[1]))

    # Draw the graph
    x = np.arange(len(tickers))
    plt.figure(figsize=(14, 6))  # window size
    plt.bar(x=x, height=values, color='C2')  # bar chart of per-ticker news counts
    plt.xticks(ticks=x, labels=tickers, rotation=90, fontsize=5)  # x-axis tick labels
    plt.tick_params(axis='x', direction='in', length=3, pad=6, labelcolor='blue')
    plt.title("Number of News Data ( NASDAQ 100 )")
    plt.xlabel('Tickers')
    plt.ylabel('# of News')
    plt.show()

    # Show the total number of news items and text-length statistics
    print("======================================================")
    print("{} : {}".format("Total number of news articles", total_count))
    df = pd.DataFrame(ranking, index=["Number of News", "Average of News Text Length"])
    df = df.transpose()
    df.to_excel("dataset_creation/tickers_numAndAvg.xlsx")
    print(df["Number of News"].describe())
    text_length_df = pd.DataFrame(newsTextLengthList, columns=["News Text Length"])
    print(text_length_df.describe())
    text_length_df.to_excel("dataset_creation/textLength.xlsx")
    from IPython import embed; embed()  # drop into an interactive shell for inspection
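
# Note: the functions above and below share the on-disk layout
# ./news/<ticker>/<date>/<title>.txt, with dates formatted "YYYY.MM.DD" so
# that plain string comparisons such as date < "2023.01.01" order them
# chronologically.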
def get_news(tickers, boundary_date='2023.01.01'):
    """Crawl news articles for the given tickers into ./news; return the number of articles crawled."""
    if not os.path.exists('./news'):
        os.mkdir('./news')

    # Load the NASDAQ tickers list
    nasdaq100_codezb = pd.read_csv('./dataset_creation/nasdaq_url.tsv', sep='\t', index_col='ticker')
    nasdaq_tickers = list(nasdaq100_codezb.index)

    total_count = 0
    for ticker in tickers:
        num = 0
        print("============================== {} ==============================".format(ticker))
        # Check whether the ticker's directory exists
        if not os.path.exists('./news/' + ticker):
            os.mkdir('./news/' + ticker)
        # When the ticker is not in the NASDAQ tickers list
        if ticker not in nasdaq_tickers:
            print("[ Check NASDAQ Tickers List ]")
            print(ticker, "is not in NASDAQ 100")
            from IPython import embed; embed()
        # When the ticker has no news URL, fall back to hard-coded ones
        if pd.isna(nasdaq100_codezb.loc[ticker, 'url']):
            print("[ Check get_codezb() Method ]")
            print(ticker, "has no News URL")
            if ticker == "TEAM":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/ATLASSIAN-CORPORATION-25531314/news-quality/"
            elif ticker == "BKR":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/BAKER-HUGHES-COMPANY-40311111/news-quality/"
            elif ticker == "CSGP":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/COSTAR-GROUP-INC-8923/news-quality/"
            elif ticker == "FANG":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/DIAMONDBACK-ENERGY-INC-11732858/news-quality/"
            elif ticker == "ENPH":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/ENPHASE-ENERGY-INC-10335237/news-quality/"
            elif ticker == "GFS":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/GLOBALFOUNDRIES-INC-128691269/news-quality/"
            elif ticker == "RIVN":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/RIVIAN-AUTOMOTIVE-INC-129226108/news-quality/"
            elif ticker == "WBD":
                nasdaq100_codezb.loc[ticker, 'url'] = "https://www.marketscreener.com/quote/stock/WARNER-BROS-DISCOVERY-I-136094563/news-quality/"
            else:
                from IPython import embed; embed()
        # Build the ticker's paginated news URL
        url = nasdaq100_codezb.loc[ticker, 'url'] + "&&fpage={}"
        page = 0
        stop = False
        while True:
            page += 1
            print("URL : {}".format(url.format(page)))
            # Fetch the HTML
            html = urllib.request.urlopen(url.format(page)).read()
            soup = BeautifulSoup(html, 'html.parser')
            # News list on the current page
            news_list = soup.select('td[class="std_txt th_inner"]')[0].select('table[class="tabBody"] tr')
            for news in news_list:
                # 1. Date
                date = news.select('td')[0].text
                if ':' in date:
                    # A time of day means the article was published today
                    today = datetime.datetime.now()
                    today = today.strftime("%Y.%m.%d")
                    date = today
                elif '/' in date:
                    # "MM/DD" means the current year
                    date = date.replace('/', '.')
                    today = datetime.datetime.now()
                    today = today.strftime("%Y")
                    date = today + "." + date
                else:
                    # XXX: previous years are not handled
                    date = date + ".12.12"  # placeholder date
                # Only crawl articles on or after boundary_date
                if date < boundary_date:
                    stop = True
                    break
                # 2. URL
                news_url = "https://www.marketscreener.com/" + news.select('td')[1].select('a')[0]['href']
                # 3. Title
                news_title = news.select('td')[1].text
                if '…' in news_title:
                    news_title = news_title.replace('/…', '')
                if '/' in news_title:
                    news_title = news_title.replace('/', '|')
                # 4. News form
                news_form = news.select('td')[2].text
                # A) Before crawling, make sure the date directory exists
                if not os.path.exists('./news/{}/{}'.format(ticker, date)):
                    os.mkdir('./news/{}/{}'.format(ticker, date))
                # B) Save the URL for this ticker, date, and title separately
                save_news_url(ticker, date, news_url, news_title)
                # C) Crawl the article body from here on
                if news_form == 'MT':
                    # Title only, no article body; skip for now
                    pass
                elif news_form == 'MD':
                    get_md(ticker, date, news_url, news_title)
                    num += 1
                elif news_form == 'RE':
                    get_re(ticker, date, news_url, news_title)
                    num += 1
                elif news_form == 'AQ':
                    get_aq(ticker, date, news_url, news_title)
                    num += 1
                elif news_form == 'DJ':
                    # Digests of other news articles, so crawling seems
                    # unnecessary => on hold for now
                    pass
                elif (news_form == '') or (news_form == 'PR') or (news_form == 'PU'):
                    get_(ticker, date, news_url, news_title)
                    num += 1
                elif news_form == 'AN':
                    get_an(ticker, date, news_url, news_title)
                    num += 1
                elif news_form == 'CI':
                    # Requires a subscription; cannot be crawled
                    pass
                elif news_form == 'BU':
                    get_bu(ticker, date, news_url, news_title)
                    num += 1
                else:
                    print('Pass. {}'.format(news_form))
                    continue
            if stop:
                break
        print("\nNumber of Crawling News : {}".format(num))
        total_count += num
    return total_count
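
# Example usage, mirroring the __main__ block below:
#   total = get_news(['ADP', 'AAPL', 'META'], boundary_date='2023.01.01')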
# Save the URL for each ticker, date, and title separately
def save_news_url(ticker, date, url, title):
    """Append or update a (ticker, date, title, url) row in save_news_url.tsv."""
    path = "dataset_creation/save_news_url.tsv"
    if not os.path.exists(path):
        df = pd.DataFrame(columns=['ticker', 'date', 'title', 'url'])
    else:
        df = pd.read_csv(path, sep='\t', index_col=0)
    filt = (df['ticker'] == ticker) & (df['date'] == date) & (df['title'] == title)
    # First time this entry is saved
    if len(df[filt]) == 0:
        df.loc[len(df)] = [ticker, date, title, url]
    # If it already exists, update the URL
    else:
        df.loc[filt, 'url'] = url
    df.to_csv(path, sep='\t')
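
# A saved row is tab-separated as index, ticker, date, title, url,
# e.g. (illustrative values):
#   0    AAPL    2023.01.05    Some headline    https://www.marketscreener.com/...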
def get_md(ticker, date, url, title):
    """Crawl an 'MD'-form article: a single <p> inside span.clearfix > div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity checks on the expected page structure
    a = soup.select('span[class=clearfix]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    a = soup.select('span[class=clearfix]')[0].select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    a = soup.select('span[class=clearfix]')[0].select('div[id=grantexto]')[0].select('p')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text = soup.select('span[class=clearfix]')[0].select('div[id=grantexto]')[0].select('p')[0].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
def get_re(ticker, date, url, title):
    """Crawl an 'RE'-form article: all <p> blocks inside div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity check on the expected page structure
    a = soup.select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text_list = soup.select('div[id=grantexto] p')
    text = ''
    for i in range(len(text_list)):
        text = text + text_list[i].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
def get_aq(ticker, date, url, title):
    """Crawl an 'AQ'-form article: all <p> blocks inside div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity check on the expected page structure
    a = soup.select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text_list = soup.select('div[id=grantexto] p')
    text = ''
    for i in range(len(text_list)):
        text = text + text_list[i].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
def get_(ticker, date, url, title):
    """Crawl an article with an empty, 'PR', or 'PU' form: the full text of div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity check on the expected page structure
    a = soup.select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text = a[0].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
def get_an(ticker, date, url, title):
    """Crawl an 'AN'-form article: all <p> blocks inside div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity check on the expected page structure
    a = soup.select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text_list = soup.select('div[id=grantexto] p')
    text = ''
    for i in range(len(text_list)):
        text = text + text_list[i].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
def get_bu(ticker, date, url, title):
    """Crawl a 'BU'-form article: all <p> blocks inside div#grantexto."""
    # If the file already exists, skip it
    if os.path.exists('./news/{}/{}/{}.txt'.format(ticker, date, title)):
        return
    # Fetch the HTML
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Sanity check on the expected page structure
    a = soup.select('div[id=grantexto]')
    if len(a) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    # Extract and save the text
    text_list = soup.select('div[id=grantexto] p')
    text = ''
    for i in range(len(text_list)):
        text = text + text_list[i].text
    with open('./news/{}/{}/{}.txt'.format(ticker, date, title), 'w') as file:
        file.write(text)
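
# get_re, get_aq, get_an, and get_bu are identical except for the news form
# they handle; a single helper like this sketch could replace them
# (get_paragraphs is an illustrative name, not used by the code above):
def get_paragraphs(ticker, date, url, title):
    """Crawl every <p> under div#grantexto and save the concatenated text (hypothetical)."""
    path = './news/{}/{}/{}.txt'.format(ticker, date, title)
    if os.path.exists(path):
        return
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    if len(soup.select('div[id=grantexto]')) != 1:
        print("ticker : {}, date : {}, url : {}, title : {}".format(ticker, date, url, title))
        return
    text = ''.join(p.text for p in soup.select('div[id=grantexto] p'))
    with open(path, 'w') as file:
        file.write(text)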
if __name__ == '__main__':
    get_textLength_and_newsCount()

    # get_codezb()
    nasdaq_dic = pd.DataFrame(nasdaq100_crawling.get_nasdaq100())
    nasdaq100_tickers = list(nasdaq_dic.ticker)
    # total_count = get_news(nasdaq100_tickers)
    total_count = get_news(nasdaq100_tickers, boundary_date="2023.01.01")
    # tickers = ['ADP', 'AAPL', 'META']
    # total_count = get_news(tickers)
    print("total_count : ", total_count)
    print("Finish")