Spaces:
Runtime error
Runtime error
File size: 6,602 Bytes
de6e775 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
from typing import List
import numpy as np
import pandas as pd
import pytz
import yfinance as yf
try:
import exchange_calendars as tc
except:
print(
"Cannot import exchange_calendars.",
"If you are using python>=3.7, please install it.",
)
import trading_calendars as tc
print("Use trading_calendars instead for yahoofinance processor..")
from finnlp.utils.config import (
BINANCE_BASE_URL,
TIME_ZONE_BERLIN,
TIME_ZONE_JAKARTA,
TIME_ZONE_PARIS,
TIME_ZONE_SELFDEFINED,
TIME_ZONE_SHANGHAI,
TIME_ZONE_USEASTERN,
USE_TIME_ZONE_SELFDEFINED,
)
from finnlp.data_processors._base import _Base, calc_time_zone
class Yahoofinance(_Base):
    """Data processor that downloads and cleans OHLCV bars from Yahoo Finance.

    ``download_data`` fetches one frame per ticker through ``yfinance`` and
    stores the standardized result on ``self.dataframe``; ``clean_data``
    re-indexes every ticker onto the NYSE trading calendar and fills gaps.
    """

    # Standardized price/volume column names, in the order used everywhere below.
    _OHLCV_COLUMNS = ("open", "high", "low", "close", "adjusted_close", "volume")

    # Yahoo Finance field names mapped to the standardized column names.
    _YAHOO_FIELD_NAMES = {
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Adj Close": "adjusted_close",
        "Volume": "volume",
    }

    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        """Forward all configuration unchanged to the base processor."""
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        """Download bars for every ticker and store them on ``self.dataframe``.

        The resulting frame has the columns ``date`` (``YYYY-MM-DD`` string),
        ``open``, ``high``, ``low``, ``close``, ``adjusted_close``, ``volume``,
        ``tic`` and ``day`` (day of week, Monday = 0), sorted by date and
        ticker. The frame is then handed to ``self.save_data(save_path)``.

        Args:
            ticker_list: Ticker symbols to download.
            save_path: Destination passed to ``self.save_data``.

        Raises:
            ValueError: If the downloaded data lacks one of the expected
                price/volume columns (this includes an empty ``ticker_list``).
        """
        self.time_zone = calc_time_zone(
            ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        )

        # Collect the per-ticker frames and concatenate once instead of
        # re-copying the accumulated frame on every iteration.
        frames = []
        for tic in ticker_list:
            temp_df = yf.download(
                tic,
                start=self.start_date,
                end=self.end_date,
                interval=self.time_interval,
                # Keep the separate "Adj Close" field regardless of the
                # installed yfinance version's default.
                auto_adjust=False,
            )
            # If yfinance returned two-level (field, ticker) columns, keep
            # only the field level so frames of different tickers line up.
            if isinstance(temp_df.columns, pd.MultiIndex):
                temp_df.columns = temp_df.columns.get_level_values(0)
            temp_df["tic"] = tic
            frames.append(temp_df)

        self.dataframe = (
            pd.concat(frames, axis=0, join="outer") if frames else pd.DataFrame()
        )

        # Use a numeric index; the former date index becomes the first column.
        self.dataframe.reset_index(inplace=True)

        # Standardize column names by field name rather than by position, so
        # a different column order cannot silently mislabel the prices.
        self.dataframe = self.dataframe.rename(
            columns={self.dataframe.columns[0]: "date", **self._YAHOO_FIELD_NAMES}
        )
        expected_columns = ["date", *self._OHLCV_COLUMNS, "tic"]
        missing_columns = [
            name for name in expected_columns if name not in self.dataframe.columns
        ]
        if missing_columns:
            raise ValueError(
                f"Downloaded data is missing expected column(s): {missing_columns}"
            )
        self.dataframe = self.dataframe[expected_columns]

        # Day of the week (Monday = 0), computed while "date" is still datetime.
        self.dataframe["day"] = self.dataframe["date"].dt.dayofweek
        print(self.dataframe)

        # Store dates as plain strings, which are easy to filter on.
        self.dataframe["date"] = self.dataframe["date"].dt.strftime("%Y-%m-%d")

        self.dataframe.dropna(inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)
        print("Shape of DataFrame: ", self.dataframe.shape)

        self.dataframe.sort_values(by=["date", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def clean_data(self):
        """Align every ticker to the full trading-time index and fill gaps.

        For ``"1D"`` the index is the list of NYSE sessions between
        ``self.start_date`` and ``self.end_date``; for ``"1Min"`` it is 390
        one-minute timestamps per session starting at 09:30 in
        ``self.time_zone``. Rows without a close price are filled with the
        previous row's close (open/high/low/close), the previous adjusted
        close, and a volume of 0. If the very first row is missing, it is
        seeded from the first row that does have a close price.

        The result replaces ``self.dataframe`` and has the columns ``time``,
        ``open``, ``high``, ``low``, ``close``, ``adjusted_close``,
        ``volume`` and ``tic``.

        Raises:
            ValueError: If ``self.time_interval`` is neither ``"1D"`` nor
                ``"1Min"``, or a ticker has no valid close price at all.
        """
        df = self.dataframe.copy()
        df = df.rename(columns={"date": "time"})
        time_interval = self.time_interval
        price_columns = list(self._OHLCV_COLUMNS)
        tic_list = np.unique(df.tic.values)

        # Build the complete time index every ticker is aligned to.
        trading_days = self.get_trading_days(start=self.start_date, end=self.end_date)
        if time_interval == "1D":
            times = trading_days
        elif time_interval == "1Min":
            times = []
            for day in trading_days:
                current_time = pd.Timestamp(day + " 09:30:00").tz_localize(
                    self.time_zone
                )
                for _ in range(390):
                    times.append(current_time)
                    current_time += pd.Timedelta(minutes=1)
        else:
            raise ValueError(
                "Data clean at given time interval is not supported for YahooFinance data."
            )

        # DataFrame.append no longer exists in pandas >= 2.0, so gather the
        # per-ticker frames and concatenate them once at the end.
        cleaned_frames = []
        for tic in tic_list:
            print(("Clean data for ") + tic)

            # Empty (all-NaN) frame over the complete time index.
            tmp_df = pd.DataFrame(columns=price_columns, index=times)

            # Copy the downloaded rows of this ticker into the frame.
            tic_df = df[df.tic == tic]
            for i in range(tic_df.shape[0]):
                row = tic_df.iloc[i]
                tmp_df.loc[row["time"]] = row[price_columns]

            # If the first row has no close, seed it from the first row that
            # does, with volume 0, so forward filling has a starting value.
            if pd.isna(tmp_df.iloc[0]["close"]):
                print("NaN data on start date, fill using first valid data.")
                valid_rows = tmp_df[tmp_df["close"].notna()]
                if valid_rows.empty:
                    raise ValueError(f"No valid close price available for {tic}.")
                first_valid_close = valid_rows.iloc[0]["close"]
                first_valid_adjclose = valid_rows.iloc[0]["adjusted_close"]
                tmp_df.iloc[0] = [
                    first_valid_close,
                    first_valid_close,
                    first_valid_close,
                    first_valid_close,
                    first_valid_adjclose,
                    0.0,
                ]

            # Fill each remaining gap with the previous close and volume 0.
            # Row 0 is valid at this point, so every previous row is too.
            for i in range(1, tmp_df.shape[0]):
                if pd.isna(tmp_df.iloc[i]["close"]):
                    previous_close = tmp_df.iloc[i - 1]["close"]
                    previous_adjusted_close = tmp_df.iloc[i - 1]["adjusted_close"]
                    tmp_df.iloc[i] = [
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_adjusted_close,
                        0.0,
                    ]

            tmp_df = tmp_df.astype(float)
            tmp_df["tic"] = tic
            cleaned_frames.append(tmp_df)

            print(("Data clean for ") + tic + (" is finished."))

        new_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()

        # Turn the time index back into a regular "time" column.
        new_df = new_df.reset_index()
        new_df = new_df.rename(columns={"index": "time"})

        print("Data clean all finished!")

        self.dataframe = new_df

    def get_trading_days(self, start, end):
        """Return the NYSE sessions in ``[start, end]`` as ``YYYY-MM-DD`` strings.

        Args:
            start: First date of the range, in any form ``pd.Timestamp`` accepts.
            end: Last date of the range, in any form ``pd.Timestamp`` accepts.

        Returns:
            A list with one 10-character date string per trading session.
        """
        nyse = tc.get_calendar("NYSE")
        sessions = nyse.sessions_in_range(pd.Timestamp(start), pd.Timestamp(end))
        # The first 10 characters of a timestamp's string form are its date.
        return [str(session)[:10] for session in sessions]
|