from finnlp.data_sources.social_media._base import Social_Media_Downloader
from tqdm import tqdm
from lxml import etree
import pandas as pd
import numpy as np
import requests
import datetime
import time
import json
import re
class Weibo_Date_Range(Social_Media_Downloader):
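    """Download Weibo posts mentioning a given stock over a date range via the s.weibo.com search pages (valid login cookies required)."""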
def __init__(self, args = {}):
super().__init__(args)
if "cookies" not in args.keys():
raise ValueError("You need first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ")
self.cookies = args["cookies"]
self.dataframe = pd.DataFrame()
def download_date_range_stock(self, start_date, end_date, start_hour= 0,end_hour = 0,stock = "θŒ…ε°", delay = 0.01):
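        """Download posts for `stock` between `start_date` and `end_date` (YYYY-MM-DD).

        With `start_hour` and `end_hour` both 0, each whole day is fetched; otherwise the
        query is limited to that hour window within each day. `delay` is the pause in
        seconds between successive requests.
        """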
self.date_list = pd.date_range(start_date, end_date)
for date in tqdm(self.date_list, desc = "Downloading by dates..."):
date = date.strftime("%Y-%m-%d")
self._gather_one_day(date, start_hour, end_hour, stock, delay)
self.dataframe = self.dataframe.reset_index(drop = True)
def _gather_one_day(self,date,start_hour, end_hour, stock = "θŒ…ε°", delay = 0.01):
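        """Collect all posts for one day (or one hour window within it) and append them to `self.dataframe`."""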
if start_hour == 0 and end_hour == 0:
start_date = datetime.datetime.strptime(date, "%Y-%m-%d")
end_date = start_date + datetime.timedelta(days=1)
start_date = start_date.strftime("%Y-%m-%d")
end_date = end_date.strftime("%Y-%m-%d")
else:
            start_date, end_date = date, date
# first page
all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay)
        # remaining pages
if len(all_urls)>1:
base_url= "https://s.weibo.com/"
for url_new in all_urls:
url_new = base_url + url_new
self._gather_other_pages(date, url_new, delay)
def _gather_first_page(self,start_date, end_date, start_hour, end_hour, stock = "θŒ…ε°", delay = 0.01):
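        """Fetch the first search-result page, parse its posts into `self.dataframe`, and return the pagination links."""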
headers = {
"cookie": self.cookies,
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
}
params = {
"q": stock,
"typeall": "1",
"suball": "1",
"timescope":f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}",
"Refer":"g",
"page":"1"
}
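        # `timescope` uses Weibo search's custom range format: custom:YYYY-MM-DD-H:YYYY-MM-DD-H (hour granularity).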
url = f"https://s.weibo.com/weibo"
resp = self._request_get(url, headers=headers, params = params)
        if resp is None:
            # request failed; return no pagination links so the caller skips this day
            return []
if "passport.weibo.com" in resp.url:
raise ValueError("Your cookies is useless. Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ")
res = etree.HTML(resp.content)
# get all pages
all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href')
items = res.xpath('//div[@class="card-wrap"]')
for i in items:
ps = i.xpath('.//div[@class="content"]//p')
try:
content = ps[0].xpath(".//text()")
content = ''.join(content)
content = content.replace('\n',"")
content = content.replace(' ',"")
content = content.replace('\u200b',"")
except:
continue
info = ps[1].xpath(".//text()")
try:
date_content = info[1]
date_content = date_content.replace('\n',"")
date_content = date_content.replace(' ',"")
except:
date_content = np.nan
try:
source = info[3]
except:
source = np.nan
tmp = pd.DataFrame([start_date, date_content, source, content]).T
tmp.columns = ["date","date_content", "source", "content"]
self.dataframe = pd.concat([self.dataframe, tmp])
time.sleep(delay)
return all_pages
def _gather_other_pages(self, date, url, delay = 0.01):
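        """Fetch one additional search-result page and append its posts to `self.dataframe`."""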
headers = {
"cookie": self.cookies,
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
}
resp = self._request_get(url, headers=headers)
        if resp is None:
            # request failed; nothing to parse for this page
            return
if "passport.weibo.com" in resp.url:
raise ValueError("Your cookies is useless. Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ")
res = etree.HTML(resp.content)
# get all pages
all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href')
items = res.xpath('//div[@class="card-wrap"]')
for i in items:
ps = i.xpath('.//div[@class="content"]//p')
try:
content = ps[0].xpath(".//text()")
content = ''.join(content)
content = content.replace('\n',"")
content = content.replace(' ',"")
content = content.replace('\u200b',"")
except:
continue
info = ps[1].xpath(".//text()")
try:
date_content = info[1]
date_content = date_content.replace('\n',"")
date_content = date_content.replace(' ',"")
except:
date_content = np.nan
try:
source = info[3]
except:
source = np.nan
tmp = pd.DataFrame([date, date_content, source, content]).T
tmp.columns = ["date", "date_content", "source", "content"]
self.dataframe = pd.concat([self.dataframe, tmp])
time.sleep(delay)
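

# A minimal usage sketch (assumption: "PASTE_YOUR_WEIBO_COOKIES_HERE" stands in for a
# real cookie string copied from a logged-in https://weibo.com/ session).
if __name__ == "__main__":
    downloader = Weibo_Date_Range({"cookies": "PASTE_YOUR_WEIBO_COOKIES_HERE"})
    downloader.download_date_range_stock("2023-01-01", "2023-01-03", stock="θŒ…ε°")
    print(downloader.dataframe.head())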