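# Sina Finance news downloader: pulls news titles for a date range from the
# Sina roll-news API (feed.mix.sina.com.cn) and, on request, scrapes the full
# article bodies from the linked pages.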
import json
import pytz
import time
import requests
import pandas as pd
import numpy as np
from lxml import etree
from tqdm import tqdm
from finnlp.data_sources.news._base import News_Downloader

class Sina_Finance_Date_Range(News_Downloader):
    """Download Sina Finance news titles (and optionally full contents) for a date range."""

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_date_range_all(self, start_date, end_date):
        # Iterate day by day over the requested range and accumulate the titles.
        self.date_list = pd.date_range(start_date, end_date)
        for date in tqdm(self.date_list, desc="Downloading Titles..."):
            tmp = self._gather_one_day(date)
            self.dataframe = pd.concat([self.dataframe, tmp])
        self.dataframe = self.dataframe.reset_index(drop=True)
    def _gather_one_day(self, date, delay=0.1):
        # Normalize to a plain YYYY-MM-DD string so that both the timestamp
        # arithmetic and the `date` query parameter are well formed.
        date = pd.to_datetime(date).strftime("%Y-%m-%d")
        # The API is queried for the 24 hours ending at 16:00 of `date`.
        end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp()
        start_timestamp = end_timestamp - 60 * 60 * 24

        res = pd.DataFrame()
        for page in range(100):
            url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}"
            response = self._request_get(url=url)
            if response is not None:
                response.encoding = "utf-8"
                text = json.loads(response.text, strict=True)
                text = text["result"]["data"]
                # An empty page means there are no further results for this day.
                if len(text) == 0:
                    break
                # Wrap each scalar value in a list so the item can be loaded
                # as a one-row DataFrame and appended to the day's results.
                for i in text:
                    for ii in i.keys():
                        i[ii] = [i[ii]]
                    tmp = pd.DataFrame(i)
                    res = pd.concat([res, tmp])
                time.sleep(delay)

        if res.shape[0] != 0:
            # Convert the Unix timestamps returned by the API to Asia/Shanghai time.
            res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True)
            res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True)
            res.intime = pd.to_datetime(res.intime, unit="s", utc=True)
            tz = pytz.timezone("Asia/Shanghai")
            res.ctime = [t.astimezone(tz) for t in res.ctime]
            res.mtime = [t.astimezone(tz) for t in res.mtime]
            res.intime = [t.astimezone(tz) for t in res.intime]
        return res
    def gather_content(self, delay=0.01):
        # Fetch the article body for every row collected by download_date_range_all().
        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
        self.dataframe["content"] = self.dataframe.apply(
            lambda x: self._gather_content_apply(x, pbar, delay), axis=1
        )

    def _gather_content_apply(self, x, pbar, delay=0.01):
        url = x.url
        response = self._request_get(url=url)
        if response is not None:
            # Extract the article paragraphs from the page's #artibody container.
            response.encoding = "utf-8"
            page = etree.HTML(response.text)
            page = page.xpath("//*[@id='artibody']/p")
            page = [p.xpath(".//text()") for p in page]
            page = ["".join(p) for p in page]
            content = "\n".join(page)
            # Drop full-width (ideographic) spaces used for paragraph indentation.
            content = content.replace("\u3000", "")
        else:
            content = np.nan
        # Advance the progress bar and pause briefly between requests.
        pbar.update(1)
        time.sleep(delay)
        return content
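
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module). It assumes
# the finnlp package is importable and the Sina endpoints are reachable; the
# column names shown ("title", "url", "content") follow the fields returned by
# the roll-news API plus the column added by gather_content().
if __name__ == "__main__":
    downloader = Sina_Finance_Date_Range()
    # Collect titles for a two-day window, then fetch the article bodies.
    downloader.download_date_range_all("2023-01-01", "2023-01-02")
    downloader.gather_content()
    print(downloader.dataframe[["title", "url", "content"]].head())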