# Inference Data
# get company news online
from datetime import date
import akshare as ak
import pandas as pd
from datetime import date, datetime, timedelta
from Ashare_data import *
# Default symbol.
symbol = "600519"
# Prompt delimiters in the Llama-2 chat convention: [INST] ... [/INST] wraps
# an instruction, <<SYS>> ... <</SYS>> wraps the system prompt.
# NOTE(review): the system markers previously read "<>" on both sides, which
# looks like the inner <SYS> / </SYS> tag was stripped; confirm against the
# prompt template the model was trained with.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
def get_curday():
    """Return today's local date as a ``YYYYMMDD`` string."""
    today = date.today()
    return format(today, "%Y%m%d")
def n_weeks_before(date_string, n, format = "%Y%m%d"):
    """Return the date ``n`` weeks before ``date_string``.

    Args:
        date_string: Reference date, always parsed as ``"%Y%m%d"``.
        n: Number of weeks to step back (a negative value steps forward).
        format: ``strftime`` pattern applied to the result only; it does not
            affect how ``date_string`` is parsed.

    Returns:
        The shifted date rendered with ``format``.
    """
    anchor = datetime.strptime(date_string, "%Y%m%d")
    shifted = anchor - timedelta(weeks=n)
    return shifted.strftime(format)
def get_news(symbol, max_page = 3):
    """Fetch news pages for ``symbol`` and merge them into one DataFrame.

    Page numbers ``1 .. max_page - 1`` are requested in order through
    ``ak.stock_news_em(symbol, page)``. Fetching stops at the first page
    that raises ``KeyError``; pages collected before that are kept.

    Args:
        symbol: Stock code passed through to ``ak.stock_news_em``.
        max_page: Exclusive upper bound of the page numbers requested.

    Returns:
        The collected pages concatenated with a fresh integer index, or an
        empty DataFrame when no page was collected.
    """
    pages = []
    for page in range(1, max_page):
        try:
            pages.append(ak.stock_news_em(symbol, page))
        except KeyError:
            # ``page`` is an int: convert it explicitly, otherwise the
            # concatenation itself raises TypeError inside this handler.
            print(str(symbol) + "pages obtained for symbol: " + str(page))
            break
    if not pages:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame instead so callers can test ``.empty``.
        return pd.DataFrame()
    return pd.concat(pages, ignore_index=True)
# get return
def get_cur_return(symbol, start_date, end_date, adjust="qfq"):
    """Build a week-by-week table of prices and returns for ``symbol``.

    Daily history is loaded with ``ak.stock_zh_a_hist``, the "收盘" column is
    resampled to weekly frequency (``"W"``) with forward fill, and every pair
    of consecutive weekly points becomes one output row.

    Args:
        symbol: Stock code forwarded to ``ak.stock_zh_a_hist``.
        start_date: First day to load, as a ``"yyyymmdd"`` string.
        end_date: Last day to load, as a ``"yyyymmdd"`` string.
        adjust: Adjustment mode forwarded to ``ak.stock_zh_a_hist``.

    Returns:
        DataFrame with the columns 起始日期, 起始价, 结算日期, 结算价, 周收益 and
        简化周收益 (``return_transform`` applied to 周收益). The frame is empty
        when the history yields fewer than two weekly points.
    """
    # load data
    daily = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date=start_date, end_date=end_date, adjust=adjust)

    # process timestamp
    daily["日期"] = pd.to_datetime(daily["日期"])
    daily.set_index("日期", inplace=True)

    # resample to weekly points, forward-filling from earlier data
    weekly_close = daily["收盘"].resample("W").ffill()
    # The first pct_change value has no predecessor, so drop it; the start
    # and end slices are offset by one to pair each week with the next.
    weekly_returns = weekly_close.pct_change().iloc[1:]
    weekly_start_prices = weekly_close.iloc[:-1]
    weekly_end_prices = weekly_close.iloc[1:]

    weekly_data = pd.DataFrame({
        '起始日期': weekly_start_prices.index,
        '起始价': weekly_start_prices.values,
        '结算日期': weekly_end_prices.index,
        '结算价': weekly_end_prices.values,
        '周收益': weekly_returns.values
    })
    # return_transform is not defined in this block.
    # NOTE(review): presumably supplied by the Ashare_data star import.
    weekly_data["简化周收益"] = weekly_data["周收益"].map(return_transform)

    # Clamp the last settlement date to end_date: the weekly resample labels
    # the final bin with its week end, which can fall after the requested
    # range. Skip when there are no rows, where indexing the last row would
    # raise IndexError.
    if not weekly_data.empty:
        end_ts = pd.to_datetime(end_date)
        last_row = weekly_data.index[-1]
        if weekly_data.loc[last_row, "结算日期"] > end_ts:
            weekly_data.loc[last_row, "结算日期"] = end_ts
    return weekly_data
# get basics
def cur_financial_data(symbol, start_date, end_date, with_basics = True):
# get data
data = get_cur_return(symbol=symbol, start_date=start_date, end_date=end_date)
news_df = get_news(symbol=symbol)
news_df["发布时间"] = pd.to_datetime(news_df["发布时间"], exact=False, format="%Y-%m-%d")
news_df.sort_values(by=["发布时间"], inplace=True)
# match weekly news for return data
news_list = []
for a, row in data.iterrows():
week_start_date = row['起始日期'].strftime('%Y-%m-%d')
week_end_date = row['结算日期'].strftime('%Y-%m-%d')
print(symbol, ': ', week_start_date, ' - ', week_end_date)
weekly_news = news_df.loc[(news_df["发布时间"]>week_start_date) & (news_df["发布时间"]