Spaces:
Runtime error
Runtime error
File size: 3,647 Bytes
ed0a845 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import warnings
warnings.filterwarnings("ignore")
from finnlp.data_sources.social_media._base import Social_Media_Downloader
import requests
from urllib import parse
from tqdm import tqdm
from datetime import datetime,timedelta
import pandas as pd
import json
import time
class Twitter_Date_Range(Social_Media_Downloader):
    """Download tweets that mention a stock ticker, one calendar day at a time.

    The combined result of the most recent ``download_date_range_stock`` call
    is stored in ``self.dataframe``.
    """

    def __init__(self, args=None):
        """Initialise the downloader with an empty result frame.

        Args:
            args: Configuration object forwarded to the base-class
                constructor. ``None`` (the default) is replaced by a fresh
                empty dict so no dict instance is shared between downloaders.
        """
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
        """Collect tweets about ``stock`` for every day in the given range.

        One search request is issued per day of
        ``pd.date_range(start_date, end_date)``. The daily results are
        concatenated, sorted by ``created_at``, restricted to
        ``start_date <= created_at <= end_date`` and stored in
        ``self.dataframe`` with a fresh integer index. When no tweets were
        collected, ``self.dataframe`` is set to an empty DataFrame.

        Args:
            start_date: First day of the range (anything ``pd.date_range``
                accepts).
            end_date: Last day of the range (anything ``pd.date_range``
                accepts).
            stock: Search term placed at the start of the query.

        Returns:
            None. The result is available as ``self.dataframe``.
        """
        self.date_list = pd.date_range(start_date, end_date)

        # Gather the per-day frames and concatenate once at the end instead of
        # growing a DataFrame inside the loop; days without data are skipped.
        daily_frames = []
        for date in tqdm(self.date_list, desc="Downloading by day... "):
            day_frame = self._gather_one_day(date, stock)
            if not day_frame.empty:
                daily_frames.append(day_frame)

        if not daily_frames:
            # Nothing was downloaded, so there is no "created_at" column to
            # convert, sort or filter on.
            self.dataframe = pd.DataFrame()
            return

        res = pd.concat(daily_frames)
        res["created_at"] = pd.to_datetime(res["created_at"])
        res = res.sort_values("created_at")

        # A single combined mask avoids indexing an already-filtered frame
        # with a mask that was built from the unfiltered one.
        # NOTE(review): the bounds are compared exactly as passed in; if
        # "created_at" parses as timezone-aware, timezone-naive bounds may not
        # be comparable - confirm against real response data.
        in_range = (res["created_at"] >= start_date) & (res["created_at"] <= end_date)
        self.dataframe = res[in_range].reset_index(drop=True)

    def _gather_one_day(self, date, stock="AAPL", pbar=None, delay=0.01):
        """Fetch the tweets mentioning ``stock`` for a single day.

        A guest token is requested first and attached to the headers of the
        search request. The search query is
        ``"<stock> until:<day after date> since:<date>"``.

        Args:
            date: The day to query; must support ``+ timedelta`` and
                ``strftime`` (e.g. ``pd.Timestamp`` or ``datetime``).
            stock: Search term placed at the start of the query.
            pbar: Unused; kept so existing callers keep working.
            delay: Seconds to sleep before any request is made.

        Returns:
            A DataFrame with one row per entry of
            ``response["globalObjects"]["tweets"]``, sorted by
            ``created_at``. An empty DataFrame is returned when the search
            request yields ``None`` or the response cannot be parsed into
            that shape.

        Raises:
            Whatever the guest-token request or its decoding raises; that
            step is intentionally not guarded, matching the prior behaviour.
        """
        time.sleep(delay)
        next_date = (date + timedelta(days=1)).strftime("%Y-%m-%d")
        date = date.strftime("%Y-%m-%d")

        url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo"
        url_token = 'https://api.twitter.com/1.1/guest/activate.json'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            # Filled in below once the guest token has been obtained.
            'x-guest-token': '',
            'x-twitter-client-language': 'zh-cn',
            'x-twitter-active-user': 'yes',
            'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
            'Referer': 'https://twitter.com/',
            'Connection': 'keep-alive',
        }

        query = f'{stock} until:{next_date} since:{date}'
        token = json.loads(requests.post(url_token, headers=headers).text)['guest_token']
        headers['x-guest-token'] = token
        url = url.format(parse.quote(query))

        # _request_get is not defined in this class (presumably inherited);
        # a None result is treated as "no data for this day".
        response = self._request_get(url, headers=headers)
        if response is None:
            return pd.DataFrame()

        try:
            payload = json.loads(response.text)
            return pd.DataFrame(payload["globalObjects"]["tweets"]).T.sort_values("created_at")
        except (ValueError, KeyError, TypeError, AttributeError):
            # Malformed JSON, an unexpected payload shape, or a day with no
            # tweets (no "created_at" column to sort by): best effort, return
            # nothing for this day. KeyboardInterrupt and SystemExit are
            # deliberately left to propagate.
            return pd.DataFrame()
|