# NOTE(review): the original file began with "Spaces: / Sleeping / Sleeping" —
# scraping residue from a Hugging Face Spaces page header, not Python code.
# Converted to a comment so the module parses.
import pandas as pd | |
import pandas_datareader as pdr | |
import numpy as np | |
import yfinance as yf | |
import requests | |
from bs4 import BeautifulSoup | |
from typing import List | |
from tqdm import tqdm | |
import os | |
import datetime | |
import json | |
from pandas.tseries.offsets import BDay | |
from sqlalchemy import create_engine | |
# from dotenv import load_dotenv | |
# load_dotenv() | |
# Earliest session loaded from the warehouse; used as an exclusive lower bound
# in the SQL query inside get_daily().
data_start_date = '2018-07-01'
def get_daily(mode='daily', periods_30m=None):
    '''
    Build the daily SPX/VIX/VVIX feature set used by the models.

    Loads daily bars from the MySQL warehouse, tops them up with any newer
    sessions from yfinance, engineers the daily features (performance flags,
    gaps, H1/H2/L1/L2 expected-range levels, econ-release flags, green/red
    streaks) and, when requested, merges the 30-minute intraday features.

    Parameters
    ----------
    mode : str
        'daily' (default) or 'intra'. 'intra' additionally merges the 30m
        data from `getIntraData.get_intra` and builds intraday features.
    periods_30m : int, optional
        How many 30m periods to bring in. Only used when mode == 'intra'.

    Returns
    -------
    tuple of (data, df_final, final_row)
        data      : full feature DataFrame, Datetime index.
        df_final  : rows up to final_row restricted to the model columns plus
                    'Target'/'Target_clf', with unlabeled rows dropped.
        final_row : index label of the second-to-last row (the last row has
                    no next-day target yet).
    '''
    def _normalize_daily_index(prices):
        '''Strip the time/timezone component from a yfinance daily index so
        it aligns with the naive-midnight Datetime index from the DB.'''
        dates = pd.to_datetime([str(x).split()[0] for x in prices.index]).date
        out = prices.copy()
        out.index = pd.DatetimeIndex(dates)
        return out

    vix = yf.Ticker('^VIX')
    vvix = yf.Ticker('^VVIX')
    spx = yf.Ticker('^GSPC')

    # Grab historical daily bars from the warehouse.
    engine = create_engine(
        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
    )
    # NOTE: data_start_date is a module-level constant, so interpolating it
    # into the SQL text is safe; switch to bind parameters if it ever becomes
    # user-supplied.
    query = f'''SELECT
    spx.Datetime AS Datetime,
    spx.Open AS Open,
    spx.High AS High,
    spx.Low AS Low,
    spx.Close AS Close,
    vix.Open AS Open_VIX,
    vix.High AS High_VIX,
    vix.Low AS Low_VIX,
    vix.Close AS Close_VIX,
    vvix.Open AS Open_VVIX,
    vvix.High AS High_VVIX,
    vvix.Low AS Low_VVIX,
    vvix.Close AS Close_VVIX
FROM
    SPX_full_1day AS spx
        LEFT JOIN
    VIX_full_1day AS vix ON spx.Datetime = vix.Datetime AND vix.Datetime > '{data_start_date}'
        LEFT JOIN
    VVIX_full_1day AS vvix ON spx.Datetime = vvix.Datetime AND vvix.Datetime > '{data_start_date}'
WHERE
    spx.Datetime > '{data_start_date}'
'''
    data = pd.read_sql_query(sql=query, con=engine.connect())
    data['Datetime'] = pd.to_datetime(data['Datetime'])
    data = data.set_index('Datetime', drop=True)

    # Incremental top-up: fetch sessions newer than the warehouse's last row.
    last_date = data.index.date[-1] + BDay(1)  # next business day after DB
    prices_vix = vix.history(start=last_date, interval='1d')
    prices_vvix = vvix.history(start=last_date, interval='1d')
    prices_spx = spx.history(start=last_date, interval='1d')
    if len(prices_spx) > 0:
        prices_spx = _normalize_daily_index(prices_spx)
        prices_vix = _normalize_daily_index(prices_vix)
        prices_vvix = _normalize_daily_index(prices_vvix)
        data1 = prices_spx.merge(prices_vix[['Open', 'High', 'Low', 'Close']],
                                 left_index=True, right_index=True, suffixes=['', '_VIX'])
        data1 = data1.merge(prices_vvix[['Open', 'High', 'Low', 'Close']],
                            left_index=True, right_index=True, suffixes=['', '_VVIX'])
        data = pd.concat([data, data1])

    if mode == 'intra':
        from getIntraData import get_intra
        df_intra = get_intra(periods_30m)
        data = data.merge(df_intra, left_index=True, right_index=True)

    # --- Daily features -----------------------------------------------------
    data['PrevClose'] = data['Close'].shift(1)
    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
    # BUGFIX: the *_n1 columns represent the prior day's value, but the VIX /
    # VVIX versions never shifted. Shift by 1 to match Perf5Day_n1.
    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
    data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)
    # Did VIX / VVIX open above yesterday's close?
    data['VIXOpen'] = (data['Open_VIX'] > data['Close_VIX'].shift(1)).astype(bool)
    data['VVIXOpen'] = (data['Open_VVIX'] > data['Close_VVIX'].shift(1)).astype(bool)
    data['Range'] = data[['Open', 'High']].max(axis=1) - data[['Low', 'Open']].min(axis=1)  # Current day range in points
    data['RangePct'] = data['Range'] / data['Close']
    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
    data['OHLC4_VIX'] = data[['Open_VIX', 'High_VIX', 'Low_VIX', 'Close_VIX']].mean(axis=1)
    data['OHLC4'] = data[['Open', 'High', 'Low', 'Close']].mean(axis=1)
    data['OHLC4_Trend'] = (data['OHLC4'] > data['OHLC4'].shift(1)).astype(bool)
    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1).astype(float)
    # BUGFIX: _n2 previously used shift(1), duplicating _n1; it should look
    # two sessions back.
    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2).astype(float)
    data['RangePct_n1'] = data['RangePct'].shift(1)
    data['RangePct_n2'] = data['RangePct'].shift(2)
    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
    data['CurrentGapHist'] = data['CurrentGap'].copy()
    data['CurrentGap'] = data['CurrentGap'].shift(-1)  # the *next* session's gap

    # Target -- next day's OHLC4 return vs today's close (regression).
    data['Target'] = ((data['OHLC4'] / data['PrevClose']) - 1).shift(-1)
    # Target for clf -- whether tomorrow closes above today's close.
    data['Target_clf'] = (data['Close'] > data['PrevClose']).shift(-1)
    # Calendar features. (A dead `DayOfWeek = .dt.day` assignment that was
    # immediately overwritten has been removed.)
    data['DayOfWeek'] = pd.to_datetime(data.index)
    data['Quarter'] = data['DayOfWeek'].dt.quarter
    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday

    # Expected-range levels from trailing 30-day up/down moves (% of close).
    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
    data['upSD'] = data['up'].rolling(30).std(ddof=0)
    data['aveUp'] = data['up'].rolling(30).mean()
    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
    data['downSD'] = data['down'].rolling(30).std(ddof=0)
    data['aveDown'] = data['down'].rolling(30).mean()
    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
    data = data.assign(
        L1Touch=lambda x: x['Low'] < x['L1'],
        L2Touch=lambda x: x['Low'] < x['L2'],
        H1Touch=lambda x: x['High'] > x['H1'],
        H2Touch=lambda x: x['High'] > x['H2'],
        L1Break=lambda x: x['Close'] < x['L1'],
        # BUGFIX: L1TouchRed previously tested Low < L2; as named, it is an
        # L1 touch on a red day (mirror of H1TouchGreen below).
        L1TouchRed=lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),
        L2TouchL1Break=lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
        L2Break=lambda x: x['Close'] < x['L2'],
        H1Break=lambda x: x['Close'] > x['H1'],
        H1TouchGreen=lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
        H2TouchH1Break=lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
        H2Break=lambda x: x['Close'] > x['H2'],
        OpenL1=lambda x: np.where(x['Open'] < x['L1'], 1, 0),
        OpenL2=lambda x: np.where(x['Open'] < x['L2'], 1, 0),
        OpenH1=lambda x: np.where(x['Open'] > x['H1'], 1, 0),
        OpenH2=lambda x: np.where(x['Open'] > x['H2'], 1, 0)
    )
    # Shift the open-vs-level flags so they describe the *next* session's open.
    for col in ['OpenL1', 'OpenL2', 'OpenH1', 'OpenH2']:
        data[col] = data[col].shift(-1)

    level_cols = [
        'L1Touch',
        'L2Touch',
        'H1Touch',
        'H2Touch',
        'L1Break',
        'L2Break',
        'H1Break',
        'H2Break'
    ]
    # Trailing 100-session hit rates for each level event.
    for col in level_cols:
        data[col + 'Pct'] = data[col].rolling(100).mean()
    # Conditional break rates given a touch, over trailing 100 sessions.
    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
    data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
    data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
    data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
    data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()

    if mode == 'intra':
        # --- Intraday (first 30m of the *next* session) features -----------
        data['CurrentOpen30'] = data['Open30'].shift(-1)
        data['CurrentHigh30'] = data['High30'].shift(-1)
        data['CurrentLow30'] = data['Low30'].shift(-1)
        data['CurrentClose30'] = data['Close30'].shift(-1)
        # NOTE(review): despite the OHLC4 name this takes the max, not the
        # mean, of O/H/L/C -- kept as-is; confirm intent before changing.
        data['CurrentOHLC430'] = data[['CurrentOpen30', 'CurrentHigh30', 'CurrentLow30', 'CurrentClose30']].max(axis=1)
        data['OHLC4_Current_Trend'] = (data['CurrentOHLC430'] > data['OHLC4']).astype(bool)
        data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
        data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
        data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
        data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
        # First-30m moves relative to today's close (= prev close of the
        # session being described).
        data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
        data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
        data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
        data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
        # Gap filled within the first 30m: price traded back through the
        # prior close in the direction opposite the gap.
        data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close
                             for high, low, prev_close, gap
                             in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
        # Where the next session's first 30m closes relative to the levels.
        data['CloseL1'] = np.where(data['Close30'] < data['L1'], 1, 0)
        data['CloseL2'] = np.where(data['Close30'] < data['L2'], 1, 0)
        data['CloseH1'] = np.where(data['Close30'] > data['H1'], 1, 0)
        data['CloseH2'] = np.where(data['Close30'] > data['H2'], 1, 0)
        for col in ['CloseL1', 'CloseL2', 'CloseH1', 'CloseH2']:
            data[col] = data[col].shift(-1)

        def get_quintiles(df, col_name, q):
            '''Mean GreenDay rate within each qcut bucket of col_name.'''
            return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()

        probas = []
        # Historical green-day probability given the current 30m price level,
        # using only rows strictly before i (no lookahead).
        for i, pct in enumerate(data['CurrentClose30toClose']):
            # BUGFIX: p is reset every iteration; previously a stale value
            # from an earlier row leaked through when no bucket matched.
            p = None
            try:
                df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
                for q in df_q.index:
                    if q.left <= pct <= q.right:
                        p = df_q[q]
                        break
            except Exception:
                # qcut raises on the first few rows (too little history).
                p = None
            probas.append(p)
        data['GreenProbas'] = probas

    # Economic release calendar: merge and build next-day "_shift" flags.
    # (Reuses the engine created above instead of rebuilding an identical one.)
    df_releases = pd.read_sql_query('select * from releases', con=engine)
    df_releases = df_releases.set_index('Datetime')
    data = data.merge(df_releases, how='left', left_index=True, right_index=True)
    for n in tqdm(df_releases.columns, desc='Merging econ data'):
        # Column flagging that release n lands on the *next* session.
        data[f'{n}_shift'] = data[n].shift(-1)
        # Fill the rest with zeroes
        data[n] = data[n].fillna(0)
        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)

    def _streaks(col):
        '''Running count of consecutive 1s, reset to 0 on a 0; any other
        value leaves the running count unchanged (matches original).'''
        nums = []
        s = 0
        for x in col:
            if x == 1:
                s += 1
            elif x == 0:
                s = 0
            nums.append(s)
        return nums

    data['DaysGreen'] = _streaks(data['GreenDay'].values)
    data['DaysRed'] = _streaks(data['RedDay'].values)

    final_row = data.index[-2]  # last row has no next-day target yet
    if mode == 'daily':
        from dailyCols import model_cols
    elif mode == 'intra':
        from intraCols import model_cols
    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
    df_final = df_final.dropna(subset=['Target', 'Target_clf'])
    return data, df_final, final_row