cleanup

Files changed:
- README.md +5 -0
- getDailyData.py +75 -27
- getIntraData.py +5 -5
- model_1h.py +0 -481
- model_30m.py +0 -506
- model_90m.py +0 -481
- model_day.py +0 -434
- model_intra.py +0 -531
- model_intra_v2.py +52 -40
- requirements.txt +2 -1
- troubleshoot_day_model.ipynb +0 -0
README.md
CHANGED
@@ -9,6 +9,11 @@ app_file: app.py
 pinned: false
 ---
 
+# Gameday SPX: An ML Approach to Predicting 🔥 or 💩
+Given the proliferation of trading styles available to retail investors, there are many different ways to make money in today's market. This model is an effort to simplify my personal trading style: gather most of the features I consider important, feed them to a model, and get an output for whether the day will be green for $SPX (higher than yesterday's close) or red (lower than yesterday's close).
+
+This is specific to $SPX, because I primarily trade this index. Justification for
+
 # TL;DR on WTF
 - The purpose of this project is to predict whether the current day's close will be above the previous day's close (`Target`).
 - Predictions are produced through generalized stacking of an ensemble of 2 models.
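The "generalized stacking of an ensemble of 2 models" in the TL;DR is not spelled out in this commit. The sketch below shows one minimal form of that idea on synthetic data: a regressor predicts the next-day return, and the sign of that prediction is fed as an extra feature into a classifier that outputs the probability of a green day. Only the `RegrModelOut` column name is borrowed from the model scripts in this repo; the feature names, models, and data are illustrative.

```python
# Minimal two-stage stacking sketch (synthetic data, hypothetical feature names).
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 4)), columns=['f1', 'f2', 'f3', 'f4'])
y_regr = X['f1'] * 0.1 + rng.normal(scale=0.05, size=500)   # next-day return proxy
y_clf = (y_regr > 0).astype(int)                            # green/red label

split = 400
regr = LinearRegression().fit(X.iloc[:split], y_regr.iloc[:split])

X_stacked = X.copy()
X_stacked['RegrModelOut'] = regr.predict(X) > 0             # stacked feature: sign of regression output

clf = GradientBoostingClassifier().fit(X_stacked.iloc[:split], y_clf.iloc[:split])
proba_green = clf.predict_proba(X_stacked.iloc[split:])[:, -1]  # P(close > previous close)
```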
getDailyData.py
CHANGED
@@ -9,7 +9,10 @@ from tqdm import tqdm
 import os
 import datetime
 import json
+from pandas.tseries.offsets import BDay
 from sqlalchemy import create_engine
+from dotenv import load_dotenv
+load_dotenv()
 
 data_start_date = '2018-07-01'
 
@@ -24,37 +27,82 @@ def get_daily(mode='daily', periods_30m=None):
     vvix = yf.Ticker('^VVIX')
     spx = yf.Ticker('^GSPC')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Grab data from db
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    query = f'''SELECT
+        spx.Datetime AS Datetime,
+        spx.Open AS Open,
+        spx.High AS High,
+        spx.Low AS Low,
+        spx.Close AS Close,
+        vix.Open AS Open_VIX,
+        vix.High AS High_VIX,
+        vix.Low AS Low_VIX,
+        vix.Close AS Close_VIX,
+        vvix.Open AS Open_VVIX,
+        vvix.High AS High_VVIX,
+        vvix.Low AS Low_VVIX,
+        vvix.Close AS Close_VVIX
+    FROM
+        SPX_full_1day AS spx
+    LEFT JOIN
+        VIX_full_1day AS vix ON spx.Datetime = vix.Datetime AND vix.Datetime > '{data_start_date}'
+    LEFT JOIN
+        VVIX_full_1day AS vvix ON spx.Datetime = vvix.Datetime AND vvix.Datetime > '{data_start_date}'
+    WHERE
+        spx.Datetime > '{data_start_date}'
+
+    '''
+    data = pd.read_sql_query(sql=query, con=engine.connect())
+    data['Datetime'] = pd.to_datetime(data['Datetime'])
+    data = data.set_index('Datetime', drop=True)
+
+    # Get incremental date
+    last_date = data.index.date[-1]
+    last_date = last_date + BDay(1)
+
+    prices_vix = vix.history(start=last_date, interval='1d')
+    prices_vvix = vvix.history(start=last_date, interval='1d')
+    prices_spx = spx.history(start=last_date, interval='1d')
+
+    if len(prices_spx) > 0:
+
+        prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+        prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+        prices_spx.index = prices_spx['index']
+        prices_spx = prices_spx.drop(columns='index')
+        prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+        prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+        prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+        prices_vix.index = prices_vix['index']
+        prices_vix = prices_vix.drop(columns='index')
+        prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+        prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
+        prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
+        prices_vvix.index = prices_vvix['index']
+        prices_vvix = prices_vvix.drop(columns='index')
+        prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
+
+        data1 = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+        data1 = data1.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+        data = pd.concat([data, data1])
+
+    else:
+        data = data.copy()
+
     if mode == 'intra':
         from getIntraData import get_intra
         df_intra = get_intra(periods_30m)
-        data = 
+        data = data.merge(df_intra, left_index=True, right_index=True)
     else:
-        data = 
-
-        data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-        data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+        data = data.copy()
 
     # Features
     data['PrevClose'] = data['Close'].shift(1)
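The added block pulls the full history from MySQL, then asks yfinance only for bars newer than the last stored business day and concatenates them onto the query result. A minimal, self-contained sketch of that incremental-update pattern is below; the in-memory DataFrame stands in for the database table, and everything except the `BDay(1)` / `concat` idea is illustrative.

```python
# Sketch of the incremental-update pattern: read what is stored, fetch only newer bars, concat.
import pandas as pd
import yfinance as yf
from pandas.tseries.offsets import BDay

stored = pd.DataFrame(
    {'Close': [4000.0, 4010.0]},
    index=pd.DatetimeIndex(['2023-08-01', '2023-08-02'], name='Datetime'),
)

next_start = stored.index.date[-1] + BDay(1)              # first business day not yet stored
fresh = yf.Ticker('^GSPC').history(start=next_start, interval='1d')

if len(fresh) > 0:
    # Normalize the timezone-aware intraday timestamps to plain dates before concatenating.
    fresh.index = pd.DatetimeIndex([str(x).split()[0] for x in fresh.index])
    combined = pd.concat([stored, fresh[['Close']]])
else:
    combined = stored.copy()
```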
getIntraData.py
CHANGED
@@ -6,10 +6,10 @@ import datetime
 from sqlalchemy import create_engine
 import os
 from getDailyData import data_start_date
-
+from dotenv import load_dotenv
 
 # Load environment variables from the .env file
-
+load_dotenv()
 
 def get_intra(periods_30m = 1):
     '''
@@ -38,11 +38,11 @@ def get_intra(periods_30m = 1):
     FROM
         SPX_full_30min AS spx30
     LEFT JOIN
-        VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > {data_start_date}
+        VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > '{data_start_date}'
     LEFT JOIN
-        VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > {data_start_date}
+        VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > '{data_start_date}'
     WHERE
-        spx30.Datetime > {data_start_date}
+        spx30.Datetime > '{data_start_date}'
 
     '''
     # spx30 = pd.read_sql_query(f'SELECT * FROM SPX_full_30min WHERE Datetime > {data_start_date}', con=engine)
model_1h.py
DELETED
@@ -1,481 +0,0 @@
-import streamlit as st
-import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import json
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-import xgboost as xgb
-from tqdm import tqdm
-from sklearn import linear_model
-import joblib
-import os
-from sklearn.metrics import roc_auc_score, precision_score, recall_score
-import datetime
-from pandas.tseries.offsets import BDay
-from datasets import load_dataset
-import lightgbm as lgb
-
-def walk_forward_validation(df, target_column, num_training_rows, num_periods):
-
-    # Create an XGBRegressor model
-    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
-    model = linear_model.LinearRegression()
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column, axis=1).iloc[:i]
-        y_train = df[target_column].iloc[:i]
-        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model.predict(X_test)
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model.save_model('model_lr.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model
-
-model_cols = [
-    'BigNewsDay',
-    'Quarter',
-    'Perf5Day',
-    'Perf5Day_n1',
-    'DaysGreen',
-    'DaysRed',
-    'CurrentHigh30toClose',
-    'CurrentLow30toClose',
-    'CurrentClose30toClose',
-    'CurrentRange30',
-    'GapFill30',
-    'CurrentGap',
-    'RangePct',
-    'RangePct_n1',
-    'RangePct_n2',
-    'OHLC4_VIX',
-    'OHLC4_VIX_n1',
-    'OHLC4_VIX_n2',
-    'OpenL1',
-    'OpenL2',
-    'OpenH1',
-    'OpenH2',
-    'L1TouchPct',
-    'L2TouchPct',
-    'H1TouchPct',
-    'H2TouchPct',
-    'L1BreakPct',
-    'L2BreakPct',
-    'H1BreakPct',
-    'H2BreakPct',
-    'GreenProbas',
-    # 'GapFillGreenProba'
-
-]
-
-def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
-
-    # Create run the regression model to get its target
-    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
-    # joblib.dump(model1, 'model1.bin')
-
-    # Merge the result df back on the df for feeding into the classifier
-    for_merge = res[['Predicted']]
-    for_merge.columns = ['RegrModelOut']
-    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
-    df = df.merge(for_merge, left_index=True, right_index=True)
-    df = df.drop(columns=[target_column_regr])
-    df = df[model_cols + ['RegrModelOut', target_column_clf]]
-
-    df[target_column_clf] = df[target_column_clf].astype(bool)
-    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
-
-    # Create an XGBRegressor model
-    # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
-    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-    # model = linear_model.LogisticRegression(max_iter=1500)
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
-        y_train = df[target_column_clf].iloc[:i]
-        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column_clf].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model2.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model2.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model1.save_model('model_ensemble.bin')
-    # joblib.dump(model2, 'model2.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model1, model2
-
-def seq_predict_proba(df, trained_reg_model, trained_clf_model):
-    regr_pred = trained_reg_model.predict(df)
-    regr_pred = regr_pred > 0
-    new_df = df.copy()
-    new_df['RegrModelOut'] = regr_pred
-    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
-    return clf_pred_proba
-
-def get_data():
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    def parse_release_dates_obs(series_id: str) -> List[str]:
-        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(obs_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        observations = []
-        for observation_tag in soup.find_all('observation'):
-            date = observation_tag.get('date')
-            value = observation_tag.get('value')
-            observations.append((date, value))
-        return observations
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        # p = parse_release_dates_obs(et)
-        # df = pd.DataFrame(columns = ['ds',et], data = p)
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
-        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
-        econ_dfs[et] = df
-
-    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
-    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
-
-    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
-    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
-
-    release_ids = [
-        "10", # "Consumer Price Index"
-        "46", # "Producer Price Index"
-        "50", # "Employment Situation"
-        "53", # "Gross Domestic Product"
-        "103", # "Discount Rate Meeting Minutes"
-        "180", # "Unemployment Insurance Weekly Claims Report"
-        "194", # "ADP National Employment Report"
-        "323" # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
-        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
-
-    vix = yf.Ticker('^VIX')
-    spx = yf.Ticker('^GSPC')
-
-
-    # Pull in data
-    data = load_dataset("boomsss/spx_intra", split='train')
-
-    rows = [d['text'] for d in data]
-    rows = [x.split(',') for x in rows]
-
-    fr = pd.DataFrame(columns=[
-        'Datetime','Open','High','Low','Close'
-    ], data = rows)
-
-    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
-    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
-    fr = fr.set_index('Datetime')
-    fr['Open'] = pd.to_numeric(fr['Open'])
-    fr['High'] = pd.to_numeric(fr['High'])
-    fr['Low'] = pd.to_numeric(fr['Low'])
-    fr['Close'] = pd.to_numeric(fr['Close'])
-
-    # Get incremental date
-    last_date = fr.index.date[-1]
-    last_date = last_date + datetime.timedelta(days=1)
-    # Get incremental data
-    spx1 = yf.Ticker('^GSPC')
-    yfp = spx1.history(start=last_date, interval='30m')
-
-    if len(yfp) > 0:
-        # Concat current and incremental
-        df_30m = pd.concat([fr, yfp])
-    else:
-        df_30m = fr.copy()
-
-    # Get the first 30 minute bar
-    df_30m = df_30m.reset_index()
-    df_30m['Datetime'] = df_30m['Datetime'].dt.date
-    df_30m = df_30m.groupby('Datetime').head(2)
-    df_30m = df_30m.set_index('Datetime',drop=True)
-    # Rename the columns
-    df_30m = df_30m[['Open','High','Low','Close']]
-
-    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
-    highs_1h = df_30m.groupby('Datetime')['High'].max()
-    lows_1h = df_30m.groupby('Datetime')['Low'].min()
-    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
-
-    df_1h = pd.DataFrame(index=df_30m.index.unique())
-    df_1h['Open'] = opens_1h
-    df_1h['High'] = highs_1h
-    df_1h['Low'] = lows_1h
-    df_1h['Close'] = closes_1h
-
-    df_1h.columns = ['Open30','High30','Low30','Close30']
-
-    prices_vix = vix.history(start='2018-07-01', interval='1d')
-    prices_spx = spx.history(start='2018-07-01', interval='1d')
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
-
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
-
-
-    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
-    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
-    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
-
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
-    data['CurrentGapHist'] = data['CurrentGap'].copy()
-    data['CurrentGap'] = data['CurrentGap'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-
-    # Calculate up
-    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
-
-    # Calculate upSD
-    data['upSD'] = data['up'].rolling(30).std(ddof=0)
-
-    # Calculate aveUp
-    data['aveUp'] = data['up'].rolling(30).mean()
-    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
-    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
-    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
-    data['downSD'] = data['down'].rolling(30).std(ddof=0)
-    data['aveDown'] = data['down'].rolling(30).mean()
-    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
-    data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
-
-    data = data.assign(
-        L1Touch = lambda x: x['Low'] < x['L1'],
-        L2Touch = lambda x: x['Low'] < x['L2'],
-        H1Touch = lambda x: x['High'] > x['H1'],
-        H2Touch = lambda x: x['High'] > x['H2'],
-        L1Break = lambda x: x['Close'] < x['L1'],
-        L2Break = lambda x: x['Close'] < x['L2'],
-        H1Break = lambda x: x['Close'] > x['H1'],
-        H2Break = lambda x: x['Close'] > x['H2'],
-        OpenL1 = lambda x: x['Open'] / x['L1'],
-        OpenL2 = lambda x: x['Open'] / x['L2'],
-        OpenH1 = lambda x: x['Open'] / x['H1'],
-        OpenH2 = lambda x: x['Open'] / x['H2']
-    )
-
-    level_cols = [
-        'L1Touch',
-        'L2Touch',
-        'H1Touch',
-        'H2Touch',
-        'L1Break',
-        'L2Break',
-        'H1Break',
-        'H2Break'
-    ]
-
-    for col in level_cols:
-        data[col+'Pct'] = data[col].rolling(100).mean()
-        data[col+'Pct'] = data[col+'Pct'].shift(-1)
-
-    # Intraday features
-    data['CurrentHigh30'] = data['High30'].shift(-1)
-    data['CurrentLow30'] = data['Low30'].shift(-1)
-    data['CurrentClose30'] = data['Close30'].shift(-1)
-    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
-
-    # Open to High
-    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
-    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
-    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
-    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
-    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
-
-    # Target -- the next day's low
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    def get_quintiles(df, col_name, q):
-        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
-
-    probas = []
-    for i, pct in enumerate(data['CurrentClose30toClose']):
-        try:
-            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-        except:
-            p = None
-
-        probas.append(p)
-
-    # gapfills = []
-    # for i, pct in enumerate(data['CurrentGap']):
-    #     try:
-    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
-    #         for q in df_q.index:
-    #             if q.left <= pct <= q.right:
-    #                 p = df_q[q]
-    #     except:
-    #         p = None
-
-    #     gapfills.append(p)
-
-    data['GreenProbas'] = probas
-    # data['GapFillGreenProba'] = gapfills
-
-    for rid in tqdm(release_ids, desc='Merging econ data'):
-        # Get the name of the release
-        n = releases[rid]['name']
-        # Merge the corresponding DF of the release
-        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
-        # Create a column that shifts the value in the merged column up by 1
-        data[f'{n}_shift'] = data[n].shift(-1)
-        # Fill the rest with zeroes
-        data[n] = data[n].fillna(0)
-        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
-
-    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
-
-    def cumul_sum(col):
-        nums = []
-        s = 0
-        for x in col:
-            if x == 1:
-                s += 1
-            elif x == 0:
-                s = 0
-            nums.append(s)
-        return nums
-
-    consec_green = cumul_sum(data['GreenDay'].values)
-    consec_red = cumul_sum(data['RedDay'].values)
-
-    data['DaysGreen'] = consec_green
-    data['DaysRed'] = consec_red
-
-    final_row = data.index[-2]
-
-    exp_row = data.index[-1]
-
-    df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
-    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
-    return data, df_final, final_row
model_30m.py
DELETED
@@ -1,506 +0,0 @@
-import streamlit as st
-import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import json
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-import xgboost as xgb
-from tqdm import tqdm
-from sklearn import linear_model
-import joblib
-import os
-from sklearn.metrics import roc_auc_score, precision_score, recall_score
-import datetime
-from pandas.tseries.offsets import BDay
-from datasets import load_dataset
-import lightgbm as lgb
-
-# If the dataset is gated/private, make sure you have run huggingface-cli login
-def walk_forward_validation(df, target_column, num_training_rows, num_periods):
-
-    # Create an XGBRegressor model
-    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
-    model = linear_model.LinearRegression()
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column, axis=1).iloc[:i]
-        y_train = df[target_column].iloc[:i]
-        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model.predict(X_test)
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model.save_model('model_lr.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model
-
-model_cols = [
-    'BigNewsDay',
-    'Quarter',
-    'Perf5Day',
-    'Perf5Day_n1',
-    'DaysGreen',
-    'DaysRed',
-    'CurrentHigh30toClose',
-    'CurrentLow30toClose',
-    'CurrentClose30toClose',
-    'CurrentRange30',
-    'GapFill30',
-    'CurrentGap',
-    'RangePct',
-    'RangePct_n1',
-    'RangePct_n2',
-    'OHLC4_VIX',
-    'OHLC4_VIX_n1',
-    'OHLC4_VIX_n2',
-    'OpenL1',
-    'OpenL2',
-    'OpenH1',
-    'OpenH2',
-    'L1TouchPct',
-    'L2TouchPct',
-    'H1TouchPct',
-    'H2TouchPct',
-    'L1BreakPct',
-    'L2BreakPct',
-    'H1BreakPct',
-    'H2BreakPct',
-    'GreenProbas',
-    # 'GapFillGreenProba'
-]
-
-def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
-
-    # Create run the regression model to get its target
-    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
-    # joblib.dump(model1, 'model1.bin')
-
-    # Merge the result df back on the df for feeding into the classifier
-    for_merge = res[['Predicted']]
-    for_merge.columns = ['RegrModelOut']
-    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
-    df = df.merge(for_merge, left_index=True, right_index=True)
-    df = df.drop(columns=[target_column_regr])
-    df = df[model_cols + ['RegrModelOut', target_column_clf]]
-
-    df[target_column_clf] = df[target_column_clf].astype(bool)
-    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
-
-    # Create an XGBRegressor model
-    # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
-    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-    # model = linear_model.LogisticRegression(max_iter=1500)
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
-        y_train = df[target_column_clf].iloc[:i]
-        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column_clf].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model2.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model2.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-
-    # Calibrate Probabilities
-    def get_quantiles(df, col_name, q):
-        return df.groupby(pd.qcut(df[col_name], q))['True'].mean()
-
-    greenprobas = []
-    meanprobas = []
-    for i, pct in enumerate(df_results['Predicted']):
-        try:
-            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-                    c = (q.left + q.right) / 2
-        except:
-            p = None
-            c = None
-
-        greenprobas.append(p)
-        meanprobas.append(c)
-
-    df_results['CalibPredicted'] = meanprobas
-    df_results['CalibGreenProba'] = greenprobas
-
-    return df_results, model1, model2
-
-
-def seq_predict_proba(df, trained_reg_model, trained_clf_model):
-    regr_pred = trained_reg_model.predict(df)
-    regr_pred = regr_pred > 0
-    new_df = df.copy()
-    new_df['RegrModelOut'] = regr_pred
-    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
-    return clf_pred_proba
-
-def get_data():
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    def parse_release_dates_obs(series_id: str) -> List[str]:
-        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(obs_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        observations = []
-        for observation_tag in soup.find_all('observation'):
-            date = observation_tag.get('date')
-            value = observation_tag.get('value')
-            observations.append((date, value))
-        return observations
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        # p = parse_release_dates_obs(et)
-        # df = pd.DataFrame(columns = ['ds',et], data = p)
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
-        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
-        econ_dfs[et] = df
-
-    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
-    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
-
-    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
-    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
-
-    release_ids = [
-        "10", # "Consumer Price Index"
-        "46", # "Producer Price Index"
-        "50", # "Employment Situation"
-        "53", # "Gross Domestic Product"
-        "103", # "Discount Rate Meeting Minutes"
-        "180", # "Unemployment Insurance Weekly Claims Report"
-        "194", # "ADP National Employment Report"
-        "323" # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
-        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
-
-    vix = yf.Ticker('^VIX')
-    spx = yf.Ticker('^GSPC')
-
-    # Pull in data
-    data = load_dataset("boomsss/spx_intra", split='train')
-
-    rows = [d['text'] for d in data]
-    rows = [x.split(',') for x in rows]
-
-    fr = pd.DataFrame(columns=[
-        'Datetime','Open','High','Low','Close'
-    ], data = rows)
-
-    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
-    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
-    fr = fr.set_index('Datetime')
-    fr['Open'] = pd.to_numeric(fr['Open'])
-    fr['High'] = pd.to_numeric(fr['High'])
-    fr['Low'] = pd.to_numeric(fr['Low'])
-    fr['Close'] = pd.to_numeric(fr['Close'])
-
-    # Get incremental date
-    last_date = fr.index.date[-1]
-    last_date = last_date + datetime.timedelta(days=1)
-    # Get incremental data
-    spx1 = yf.Ticker('^GSPC')
-    yfp = spx1.history(start=last_date, interval='30m')
-
-    if len(yfp) > 0:
-        # Concat current and incremental
-        df_30m = pd.concat([fr, yfp])
-    else:
-        df_30m = fr.copy()
-
-    # Get the first 30 minute bar
-    df_30m = df_30m.reset_index()
-    df_30m['Datetime'] = df_30m['Datetime'].dt.date
-    df_30m = df_30m.groupby('Datetime').head(1)
-    df_30m = df_30m.set_index('Datetime',drop=True)
-    # Rename the columns
-    df_30m = df_30m[['Open','High','Low','Close']]
-    df_30m.columns = ['Open30','High30','Low30','Close30']
-
-    prices_vix = vix.history(start='2018-07-01', interval='1d')
-    prices_spx = spx.history(start='2018-07-01', interval='1d')
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
-
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
-
-
-    data = prices_spx.merge(df_30m, left_index=True, right_index=True)
-    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
-    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
-
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
-    data['CurrentGapHist'] = data['CurrentGap'].copy()
-    data['CurrentGap'] = data['CurrentGap'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-
-    # Intraday features
-    data['CurrentOpen30'] = data['Open30'].shift(-1)
-    data['CurrentHigh30'] = data['High30'].shift(-1)
-    data['CurrentLow30'] = data['Low30'].shift(-1)
-    data['CurrentClose30'] = data['Close30'].shift(-1)
-    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
-
-
-    # Open to High
-    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
-    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
-    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
-    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
-    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
-
-    # Target -- the next day's low
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    # Calculate up
-    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
-
-    # Calculate upSD
-    data['upSD'] = data['up'].rolling(30).std(ddof=0)
-
-    # Calculate aveUp
-    data['aveUp'] = data['up'].rolling(30).mean()
-    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
-    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
-    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
-    data['downSD'] = data['down'].rolling(30).std(ddof=0)
-    data['aveDown'] = data['down'].rolling(30).mean()
-    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
-    data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
-
-    data = data.assign(
-        L1Touch = lambda x: x['Low'] < x['L1'],
-        L2Touch = lambda x: x['Low'] < x['L2'],
-        H1Touch = lambda x: x['High'] > x['H1'],
-        H2Touch = lambda x: x['High'] > x['H2'],
-        L1Break = lambda x: x['Close'] < x['L1'],
-        L2Break = lambda x: x['Close'] < x['L2'],
-        H1Break = lambda x: x['Close'] > x['H1'],
-        H2Break = lambda x: x['Close'] > x['H2'],
-        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
-        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
-        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
-        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
-        CloseL1 = lambda x: np.where(x['Close'] < x['L1'], 1, 0),
-        CloseL2 = lambda x: np.where(x['Close'] < x['L2'], 1, 0),
-        CloseH1 = lambda x: np.where(x['Close'] > x['H1'], 1, 0),
-        CloseH2 = lambda x: np.where(x['Close'] > x['H2'], 1, 0)
-    )
-
-    data['OpenL1'] = data['OpenL1'].shift(-1)
-    data['OpenL2'] = data['OpenL2'].shift(-1)
-    data['OpenH1'] = data['OpenH1'].shift(-1)
-    data['OpenH2'] = data['OpenH2'].shift(-1)
-    data['CloseL1'] = data['CloseL1'].shift(-1)
-    data['CloseL2'] = data['CloseL2'].shift(-1)
-    data['CloseH1'] = data['CloseH1'].shift(-1)
-    data['CloseH2'] = data['CloseH2'].shift(-1)
-
-    level_cols = [
-        'L1Touch',
-        'L2Touch',
-        'H1Touch',
-        'H2Touch',
-        'L1Break',
-        'L2Break',
-        'H1Break',
-        'H2Break'
-    ]
-
-    for col in level_cols:
-        data[col+'Pct'] = data[col].rolling(100).mean()
-        data[col+'Pct'] = data[col+'Pct'].shift(-1)
-
-
-    def get_quintiles(df, col_name, q):
-        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
-
-    probas = []
-    for i, pct in enumerate(data['CurrentClose30toClose']):
-        try:
-            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-        except:
-            p = None
-
-        probas.append(p)
-
-    # gapfills = []
-    # for i, pct in enumerate(data['CurrentGap']):
-    #     try:
-    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
-    #         for q in df_q.index:
-    #             if q.left <= pct <= q.right:
-    #                 p = df_q[q]
-    #     except:
-    #         p = None
-
-    #     gapfills.append(p)
-
-    data['GreenProbas'] = probas
-    # data['GapFillGreenProba'] = gapfills
-
-    for rid in tqdm(release_ids, desc='Merging econ data'):
-        # Get the name of the release
-        n = releases[rid]['name']
-        # Merge the corresponding DF of the release
-        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
-        # Create a column that shifts the value in the merged column up by 1
-        data[f'{n}_shift'] = data[n].shift(-1)
-        # Fill the rest with zeroes
-        data[n] = data[n].fillna(0)
-        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
-
-    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
-
-    def cumul_sum(col):
-        nums = []
-        s = 0
-        for x in col:
-            if x == 1:
-                s += 1
-            elif x == 0:
-                s = 0
-            nums.append(s)
-        return nums
-
-    consec_green = cumul_sum(data['GreenDay'].values)
-    consec_red = cumul_sum(data['RedDay'].values)
-
-    data['DaysGreen'] = consec_green
-    data['DaysRed'] = consec_red
-
-    final_row = data.index[-2]
-
-    exp_row = data.index[-1]
-
-    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
-    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
-    return data, df_final, final_row
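The deleted model_30m.py calibrated the classifier's raw probabilities by bucketing earlier predictions into quantiles and replacing each new prediction with the realized green-day rate of its bucket. A small self-contained sketch of that idea on synthetic data follows; it illustrates the technique, it is not the deleted code itself.

```python
# Sketch of quantile-bucket probability calibration (synthetic data).
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df_results = pd.DataFrame({'Predicted': rng.uniform(0, 1, 300)})          # raw classifier probabilities
df_results['True'] = rng.uniform(0, 1, 300) < df_results['Predicted']     # noisy realized outcomes

def get_quantiles(df, col_name, q):
    # Mean realized outcome within each quantile bucket of the raw prediction.
    return df.groupby(pd.qcut(df[col_name], q), observed=True)['True'].mean()

calibrated = []
for i, pct in enumerate(df_results['Predicted']):
    p = None
    try:
        buckets = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
        for interval in buckets.index:
            if interval.left <= pct <= interval.right:
                p = buckets[interval]          # realized green rate of this bucket
    except ValueError:                         # not enough history yet to form 7 buckets
        p = None
    calibrated.append(p)

df_results['CalibGreenProba'] = calibrated
```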
model_90m.py
DELETED
@@ -1,481 +0,0 @@
-import streamlit as st
-import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import json
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-import xgboost as xgb
-from tqdm import tqdm
-from sklearn import linear_model
-import joblib
-import os
-from sklearn.metrics import roc_auc_score, precision_score, recall_score
-import datetime
-from pandas.tseries.offsets import BDay
-from datasets import load_dataset
-import lightgbm as lgb
-
-def walk_forward_validation(df, target_column, num_training_rows, num_periods):
-
-    # Create an XGBRegressor model
-    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
-    model = linear_model.LinearRegression()
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column, axis=1).iloc[:i]
-        y_train = df[target_column].iloc[:i]
-        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model.predict(X_test)
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model.save_model('model_lr.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model
-
-model_cols = [
-    'BigNewsDay',
-    'Quarter',
-    'Perf5Day',
-    'Perf5Day_n1',
-    'DaysGreen',
-    'DaysRed',
-    'CurrentHigh30toClose',
-    'CurrentLow30toClose',
-    'CurrentClose30toClose',
-    'CurrentRange30',
-    'GapFill30',
-    'CurrentGap',
-    'RangePct',
-    'RangePct_n1',
-    'RangePct_n2',
-    'OHLC4_VIX',
-    'OHLC4_VIX_n1',
-    'OHLC4_VIX_n2',
-    'OpenL1',
-    'OpenL2',
-    'OpenH1',
-    'OpenH2',
-    'L1TouchPct',
-    'L2TouchPct',
-    'H1TouchPct',
-    'H2TouchPct',
-    'L1BreakPct',
-    'L2BreakPct',
-    'H1BreakPct',
-    'H2BreakPct',
-    'GreenProbas',
-    # 'GapFillGreenProba'
-]
-
-def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
-
-    # Create run the regression model to get its target
-    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
-    # joblib.dump(model1, 'model1.bin')
-
-    # Merge the result df back on the df for feeding into the classifier
-    for_merge = res[['Predicted']]
-    for_merge.columns = ['RegrModelOut']
-    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
-    df = df.merge(for_merge, left_index=True, right_index=True)
-    df = df.drop(columns=[target_column_regr])
-    df = df[model_cols + ['RegrModelOut', target_column_clf]]
-
-    df[target_column_clf] = df[target_column_clf].astype(bool)
-    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
-
-    # Create an XGBRegressor model
-    # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
-    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-    # model = linear_model.LogisticRegression(max_iter=1500)
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
-        y_train = df[target_column_clf].iloc[:i]
-        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column_clf].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model2.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model2.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model1.save_model('model_ensemble.bin')
-    # joblib.dump(model2, 'model2.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model1, model2
-
-def seq_predict_proba(df, trained_reg_model, trained_clf_model):
-    regr_pred = trained_reg_model.predict(df)
-    regr_pred = regr_pred > 0
-    new_df = df.copy()
-    new_df['RegrModelOut'] = regr_pred
-    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
-    return clf_pred_proba
-
-def get_data():
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    def parse_release_dates_obs(series_id: str) -> List[str]:
-        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(obs_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        observations = []
-        for observation_tag in soup.find_all('observation'):
-            date = observation_tag.get('date')
-            value = observation_tag.get('value')
-            observations.append((date, value))
-        return observations
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        # p = parse_release_dates_obs(et)
-        # df = pd.DataFrame(columns = ['ds',et], data = p)
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
-        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
-        econ_dfs[et] = df
-
-    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
-    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
-
-    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
-    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
-
-    release_ids = [
-        "10", # "Consumer Price Index"
-        "46", # "Producer Price Index"
-        "50", # "Employment Situation"
-        "53", # "Gross Domestic Product"
-        "103", # "Discount Rate Meeting Minutes"
-        "180", # "Unemployment Insurance Weekly Claims Report"
-        "194", # "ADP National Employment Report"
-        "323" # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
-        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
-
-    vix = yf.Ticker('^VIX')
-    spx = yf.Ticker('^GSPC')
-
-
-    # Pull in data
-    data = load_dataset("boomsss/spx_intra", split='train')
-
-    rows = [d['text'] for d in data]
-    rows = [x.split(',') for x in rows]
-
-    fr = pd.DataFrame(columns=[
-        'Datetime','Open','High','Low','Close'
-    ], data = rows)
-
-    fr['Datetime'] = pd.to_datetime(fr['Datetime'])
-    fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
-    fr = fr.set_index('Datetime')
-    fr['Open'] = pd.to_numeric(fr['Open'])
-    fr['High'] = pd.to_numeric(fr['High'])
-    fr['Low'] = pd.to_numeric(fr['Low'])
-    fr['Close'] = pd.to_numeric(fr['Close'])
-
-    # Get incremental date
-    last_date = fr.index.date[-1]
-    last_date = last_date + datetime.timedelta(days=1)
-    # Get incremental data
-    spx1 = yf.Ticker('^GSPC')
-    yfp = spx1.history(start=last_date, interval='30m')
-
-    if len(yfp) > 0:
-        # Concat current and incremental
-        df_30m = pd.concat([fr, yfp])
-    else:
-        df_30m = fr.copy()
-
-    # Get the first 30 minute bar
-    df_30m = df_30m.reset_index()
-    df_30m['Datetime'] = df_30m['Datetime'].dt.date
-    df_30m = df_30m.groupby('Datetime').head(3)
-    df_30m = df_30m.set_index('Datetime',drop=True)
-    # Rename the columns
-    df_30m = df_30m[['Open','High','Low','Close']]
-
-    opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
-    highs_1h = df_30m.groupby('Datetime')['High'].max()
-    lows_1h = df_30m.groupby('Datetime')['Low'].min()
-    closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
-
-    df_1h = pd.DataFrame(index=df_30m.index.unique())
-    df_1h['Open'] = opens_1h
-    df_1h['High'] = highs_1h
-    df_1h['Low'] = lows_1h
-    df_1h['Close'] = closes_1h
-
-    df_1h.columns = ['Open30','High30','Low30','Close30']
-
-    prices_vix = vix.history(start='2018-07-01', interval='1d')
-    prices_spx = spx.history(start='2018-07-01', interval='1d')
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
-
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
-
-
-    data = prices_spx.merge(df_1h, left_index=True, right_index=True)
-    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
-    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
-
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
-    data['CurrentGapHist'] = data['CurrentGap'].copy()
-    data['CurrentGap'] = data['CurrentGap'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-
-    # Intraday features
-    data['CurrentHigh30'] = data['High30'].shift(-1)
-    data['CurrentLow30'] = data['Low30'].shift(-1)
-    data['CurrentClose30'] = data['Close30'].shift(-1)
-    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
-
-
-    # Open to High
-    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
-    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
-    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
-    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
-    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
-
-    # Target -- the next day's low
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    # Calculate up
-    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
-
-    # Calculate upSD
-    data['upSD'] = data['up'].rolling(30).std(ddof=0)
-
-    # Calculate aveUp
-    data['aveUp'] = data['up'].rolling(30).mean()
-    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
-    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
-    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
-    data['downSD'] = data['down'].rolling(30).std(ddof=0)
-    data['aveDown'] = data['down'].rolling(30).mean()
-    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
-    data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
-
-    data = data.assign(
-        L1Touch = lambda x: x['Low'] < x['L1'],
-        L2Touch = lambda x: x['Low'] < x['L2'],
-        H1Touch = lambda x: x['High'] > x['H1'],
-        H2Touch = lambda x: x['High'] > x['H2'],
-        L1Break = lambda x: x['Close'] < x['L1'],
-        L2Break = lambda x: x['Close'] < x['L2'],
-        H1Break = lambda x: x['Close'] > x['H1'],
-        H2Break = lambda x: x['Close'] > x['H2'],
-        OpenL1 = lambda x: x['Open'] / x['L1'],
-        OpenL2 = lambda x: x['Open'] / x['L2'],
-        OpenH1 = lambda x: x['Open'] / x['H1'],
-        OpenH2 = lambda x: x['Open'] / x['H2']
-    )
-
-    level_cols = [
-        'L1Touch',
-        'L2Touch',
-        'H1Touch',
-        'H2Touch',
-        'L1Break',
-        'L2Break',
-        'H1Break',
-        'H2Break'
-    ]
-
-    for col in level_cols:
-        data[col+'Pct'] = data[col].rolling(100).mean()
-        data[col+'Pct'] = data[col+'Pct'].shift(-1)
-
-    def get_quintiles(df, col_name, q):
-        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
-
-    probas = []
-    for i, pct in enumerate(data['CurrentClose30toClose']):
-        try:
-            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-        except:
-            p = None
-
-        probas.append(p)
-
-    # gapfills = []
-    # for i, pct in enumerate(data['CurrentGap']):
-    #     try:
-    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
-    #         for q in df_q.index:
-    #             if q.left <= pct <= q.right:
-    #                 p = df_q[q]
-    #     except:
-    #         p = None
-
-    #     gapfills.append(p)
-
-    data['GreenProbas'] = probas
-    # data['GapFillGreenProba'] = gapfills
-
-    for rid in tqdm(release_ids, desc='Merging econ data'):
-        # Get the name of the release
-        n = releases[rid]['name']
-        # Merge the corresponding DF of the release
-        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
-        # Create a column that shifts the value in the merged column up by 1
-        data[f'{n}_shift'] = data[n].shift(-1)
-        # Fill the rest with zeroes
-        data[n] = data[n].fillna(0)
-        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
-
-    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
-
-    def cumul_sum(col):
-        nums = []
-        s = 0
-        for x in col:
-            if x == 1:
-                s += 1
-            elif x == 0:
-                s = 0
-            nums.append(s)
-        return nums
-
-    consec_green = cumul_sum(data['GreenDay'].values)
-    consec_red = cumul_sum(data['RedDay'].values)
-
-    data['DaysGreen'] = consec_green
-    data['DaysRed'] = consec_red
-
-    final_row = data.index[-2]
-
-    exp_row = data.index[-1]
-
-    df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
-    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
-    return data, df_final, final_row
model_day.py
DELETED
@@ -1,434 +0,0 @@
-import streamlit as st
-import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import json
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-import xgboost as xgb
-from tqdm import tqdm
-from sklearn import linear_model
-import joblib
-import os
-from sklearn.metrics import roc_auc_score, precision_score, recall_score
-import datetime
-from pandas.tseries.offsets import BDay
-import lightgbm as lgb
-
-def walk_forward_validation(df, target_column, num_training_rows, num_periods):
-
-    # Create an XGBRegressor model
-    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
-    model = linear_model.LinearRegression()
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column, axis=1).iloc[:i]
-        y_train = df[target_column].iloc[:i]
-        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model.predict(X_test)
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-    # model.save_model('model_lr.bin')
-    # Return the true and predicted values, and fitted model
-    return df_results, model
-
-model_cols = [
-    'BigNewsDay',
-    'Quarter',
-    'Perf5Day',
-    'Perf5Day_n1',
-    'DaysGreen',
-    'DaysRed',
-    'CurrentGap',
-    'RangePct',
-    'RangePct_n1',
-    'RangePct_n2',
-    'OHLC4_VIX',
-    'OHLC4_VIX_n1',
-    'OHLC4_VIX_n2',
-    'VIXOpen',
-    'VVIXOpen',
-    'OpenL1',
-    'OpenL2',
-    'OpenH1',
-    'OpenH2',
-    'L1TouchPct',
-    'L2TouchPct',
-    'H1TouchPct',
-    'H2TouchPct',
-    'L1BreakPct',
-    'L2BreakPct',
-    'H1BreakPct',
-    'H2BreakPct',
-    'H1BreakTouchPct',
-    'H2BreakTouchPct',
-    'L1BreakTouchPct',
-    'L2BreakTouchPct'
-]
-
-def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
-
-    # Create run the regression model to get its target
-    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
-    # joblib.dump(model1, 'model1.bin')
-
-    # Merge the result df back on the df for feeding into the classifier
-    for_merge = res[['Predicted']]
-    for_merge.columns = ['RegrModelOut']
-    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
-    df = df.merge(for_merge, left_index=True, right_index=True)
-    df = df.drop(columns=[target_column_regr])
-    df = df[model_cols + ['RegrModelOut', target_column_clf]]
-
-    df[target_column_clf] = df[target_column_clf].astype(bool)
-    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
-
-    # Create an XGBRegressor model
-    # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
-    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-    # model = linear_model.LogisticRegression(max_iter=1500)
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
-        # Split the data into training and test sets
-        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
-        y_train = df[target_column_clf].iloc[:i]
-        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
-        y_test = df[target_column_clf].iloc[i:i+num_periods]
-
-        # Fit the model to the training data
-        model2.fit(X_train, y_train)
-
-        # Make a prediction on the test data
-        predictions = model2.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-
-    # Calibrate Probabilities
-    def get_quantiles(df, col_name, q):
-        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
-
-    greenprobas = []
-    meanprobas = []
-    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
-        try:
-            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-                    c = (q.left + q.right) / 2
-        except:
-            p = None
-            c = None
-
-        greenprobas.append(p)
-        meanprobas.append(c)
-
-    df_results['CalibPredicted'] = greenprobas
-
-    return df_results, model1, model2
-
-def seq_predict_proba(df, trained_reg_model, trained_clf_model):
-    regr_pred = trained_reg_model.predict(df)
-    regr_pred = regr_pred > 0
-    new_df = df.copy()
-    new_df['RegrModelOut'] = regr_pred
-    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
-    return clf_pred_proba
-
-def get_data():
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    def parse_release_dates_obs(series_id: str) -> List[str]:
-        obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(obs_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        observations = []
-        for observation_tag in soup.find_all('observation'):
-            date = observation_tag.get('date')
-            value = observation_tag.get('value')
-            observations.append((date, value))
-        return observations
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        # p = parse_release_dates_obs(et)
-        # df = pd.DataFrame(columns = ['ds',et], data = p)
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
-        # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
-        econ_dfs[et] = df
-
-    # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
-    # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
-
-    # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
-    # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
-
-    release_ids = [
-        "10", # "Consumer Price Index"
-        "46", # "Producer Price Index"
-        "50", # "Employment Situation"
-        "53", # "Gross Domestic Product"
-        "103", # "Discount Rate Meeting Minutes"
-        "180", # "Unemployment Insurance Weekly Claims Report"
-        "194", # "ADP National Employment Report"
-        "323" # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-        # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
-        # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
-
-    vix = yf.Ticker('^VIX')
-    vvix = yf.Ticker('^VVIX')
-    spx = yf.Ticker('^GSPC')
-
-    prices_vix = vix.history(start='2018-07-01', interval='1d')
-    prices_spx = spx.history(start='2018-07-01', interval='1d')
-    prices_vvix = vvix.history(start='2018-07-01', interval='1d')
-
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-
-    prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
-    prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
-    prices_vvix.index = prices_vvix['index']
-    prices_vvix = prices_vvix.drop(columns='index')
-
-    data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-    data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
-    data.index = pd.DatetimeIndex(data.index)
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1).astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
-    data['VIXOpen'] = data['Open_VIX'] > data['Close_VIX'].shift(1)
-    data['VVIXOpen'] = data['Open_VVIX'] > data['Close_VVIX'].shift(1)
-    data['VIXOpen'] = data['VIXOpen'].astype(bool)
-    data['VVIXOpen'] = data['VVIXOpen'].astype(bool)
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1).astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2).astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = ((data['Open'] - data['PrevClose']) / data['PrevClose']).shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
-    data['upSD'] = data['up'].rolling(30).std(ddof=0)
-    data['aveUp'] = data['up'].rolling(30).mean()
-    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
-    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
-    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
-    data['downSD'] = data['down'].rolling(30).std(ddof=0)
-    data['aveDown'] = data['down'].rolling(30).mean()
-    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
-    data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
-    data['L1Touch'] = data['Low'] < data['L1']
-    data['L2Touch'] = data['Low'] < data['L2']
-    data['H1Touch'] = data['High'] > data['H1']
-    data['H2Touch'] = data['High'] > data['H2']
-    data['L1Break'] = data['Close'] < data['L1']
-    data['L2Break'] = data['Close'] < data['L2']
-    data['H1Break'] = data['Close'] > data['H1']
-    data['H2Break'] = data['Close'] > data['H2']
-    data['OpenL1'] = data['Open'] / data['L1']
-    data['OpenL2'] = data['Open'] / data['L2']
-    data['OpenH1'] = data['Open'] / data['H1']
-    data['OpenH2'] = data['Open'] / data['H2']
-
-    level_cols = [
-        'L1Touch',
-        'L2Touch',
-        'H1Touch',
-        'H2Touch',
-        'L1Break',
-        'L2Break',
-        'H1Break',
-        'H2Break'
-    ]
-
-    for col in level_cols:
-        data[col+'Pct'] = data[col].rolling(100).mean()
-
-    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
-    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
-    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
-    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
-
-    # Target -- the next day's low
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    for rid in tqdm(release_ids, desc='Merging econ data'):
-        # Get the name of the release
-        n = releases[rid]['name']
-        # Merge the corresponding DF of the release
-        data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
-        # Create a column that shifts the value in the merged column up by 1
-        data[f'{n}_shift'] = data[n].shift(-1)
-        # Fill the rest with zeroes
-        data[n] = data[n].fillna(0)
-        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
-
-    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
-
-    def cumul_sum(col):
-        nums = []
-        s = 0
-        for x in col:
-            if x == 1:
-                s += 1
-            elif x == 0:
-                s = 0
-            nums.append(s)
-        return nums
-
-    consec_green = cumul_sum(data['GreenDay'].values)
-    consec_red = cumul_sum(data['RedDay'].values)
-
-    data['DaysGreen'] = consec_green
-    data['DaysRed'] = consec_red
-
-    final_row = data.index[-2]
-
-    exp_row = data.index[-1]
-
-    df_final = data.loc[:final_row,
-                        [
-        'BigNewsDay',
-        'Quarter',
-        'Perf5Day',
-        'Perf5Day_n1',
-        'DaysGreen',
-        'DaysRed',
-        'CurrentGap',
-        'RangePct',
-        'RangePct_n1',
-        'RangePct_n2',
-        'OHLC4_VIX',
-        'OHLC4_VIX_n1',
-        'OHLC4_VIX_n2',
-        'VIXOpen',
-        'VVIXOpen',
-        'OpenL1',
-        'OpenL2',
-        'OpenH1',
-        'OpenH2',
-        'L1TouchPct',
-        'L2TouchPct',
-        'H1TouchPct',
-        'H2TouchPct',
-        'L1BreakPct',
-        'L2BreakPct',
-        'H1BreakPct',
-        'H2BreakPct',
-        'H1BreakTouchPct',
-        'H2BreakTouchPct',
-        'L1BreakTouchPct',
-        'L2BreakTouchPct',
-        'Target',
-        'Target_clf'
-    ]]
-    df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
-    return data, df_final, final_row
model_intra.py
DELETED
@@ -1,531 +0,0 @@
-import streamlit as st
-import pandas as pd
-import pandas_datareader as pdr
-import numpy as np
-import yfinance as yf
-import requests
-from bs4 import BeautifulSoup
-from typing import List
-from tqdm import tqdm
-import os
-import datetime
-from pandas.tseries.offsets import BDay
-from datasets import load_dataset
-import lightgbm as lgb
-from sklearn.model_selection import TimeSeriesSplit
-import json
-
-data_start_date = '2018-07-01'
-
-model_cols = [
-    'BigNewsDay',
-    'Quarter',
-    'Perf5Day',
-    'Perf5Day_n1',
-    'DaysGreen',
-    'DaysRed',
-    'CurrentHigh30toClose',
-    'CurrentLow30toClose',
-    'CurrentClose30toClose',
-    'CurrentRange30',
-    'GapFill30',
-    'CurrentGap',
-    'RangePct',
-    'RangePct_n1',
-    'RangePct_n2',
-    'OHLC4_VIX',
-    'OHLC4_VIX_n1',
-    'OHLC4_VIX_n2',
-    'OHLC4_Current_Trend',
-    'OHLC4_Trend',
-    'CurrentVIXTrend',
-    'SPX30IntraPerf',
-    'VIX30IntraPerf',
-    'VVIX30IntraPerf',
-    # 'OpenL1',
-    # 'OpenL2',
-    # 'OpenH1',
-    # 'OpenH2',
-    'L1TouchPct',
-    'L2TouchPct',
-    'H1TouchPct',
-    'H2TouchPct',
-    'L1BreakPct',
-    'L2BreakPct',
-    'H1BreakPct',
-    'H2BreakPct',
-    'GreenProbas',
-    'H1BreakTouchPct',
-    'H2BreakTouchPct',
-    'L1BreakTouchPct',
-    'L2BreakTouchPct',
-    'H1BreakH2TouchPct',
-    'L1BreakL2TouchPct',
-    'H1TouchGreenPct',
-    'L1TouchRedPct'
-    # 'GapFillGreenProba'
-]
-
-# If the dataset is gated/private, make sure you have run huggingface-cli login
-def walk_forward_validation(df, target_column, num_periods):
-
-    df = df[model_cols + [target_column]]
-    df[target_column] = df[target_column].astype(bool)
-
-    # Model
-    # model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-
-    tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
-
-    overall_results = []
-    # Iterate over the rows in the DataFrame, one step at a time
-    # Split the time series data using TimeSeriesSplit
-    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
-        # Extract the training and testing data for the current split
-        X_train = df.drop(target_column, axis=1).iloc[train_index]
-        y_train = df[target_column].iloc[train_index]
-        X_test = df.drop(target_column, axis=1).iloc[test_index]
-        y_test = df[target_column].iloc[test_index]
-
-        y_train = y_train.astype(bool)
-        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-        model.fit(X_train, y_train)
-        # Make a prediction on the test data
-        predictions = model.predict_proba(X_test)[:,-1]
-
-        # Create a DataFrame to store the true and predicted values
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
-        overall_results.append(result_df)
-
-    df_results = pd.concat(overall_results)
-
-    # Calibrate Probabilities
-    def get_quantiles(df, col_name, q):
-        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
-
-    greenprobas = []
-    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
-        try:
-            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-        except:
-            p = None
-
-        greenprobas.append(p)
-
-    df_results['CalibPredicted'] = greenprobas
-
-    return df_results, model
-
-def seq_predict_proba(df, trained_clf_model):
-    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
-    return clf_pred_proba
-
-def get_data(periods_30m = 1):
-    # f = open('settings.json')
-    # j = json.load(f)
-    # API_KEY_FRED = j["API_KEY_FRED"]
-
-    API_KEY_FRED = os.getenv('API_KEY_FRED')
-
-    def parse_release_dates(release_id: str) -> List[str]:
-        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
-        r = requests.get(release_dates_url)
-        text = r.text
-        soup = BeautifulSoup(text, 'xml')
-        dates = []
-        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
-            dates.append(release_date_tag.text)
-        return dates
-
-    econ_dfs = {}
-
-    econ_tickers = [
-        'WALCL',
-        'NFCI',
-        'WRESBAL'
-    ]
-
-    for et in tqdm(econ_tickers, desc='getting econ tickers'):
-        df = pdr.get_data_fred(et)
-        df.index = df.index.rename('ds')
-        econ_dfs[et] = df
-
-    release_ids = [
-        "10", # "Consumer Price Index"
-        "46", # "Producer Price Index"
-        "50", # "Employment Situation"
-        "53", # "Gross Domestic Product"
-        "103", # "Discount Rate Meeting Minutes"
-        "180", # "Unemployment Insurance Weekly Claims Report"
-        "194", # "ADP National Employment Report"
-        "323" # "Trimmed Mean PCE Inflation Rate"
-    ]
-
-    release_names = [
-        "CPI",
-        "PPI",
-        "NFP",
-        "GDP",
-        "FOMC",
-        "UNEMP",
-        "ADP",
-        "PCE"
-    ]
-
-    releases = {}
-
-    for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
-        releases[rid] = {}
-        releases[rid]['dates'] = parse_release_dates(rid)
-        releases[rid]['name'] = n
-
-    # Create a DF that has all dates with the name of the col as 1
-    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
-    # This column serves as the true/false indicator of whether there was economic data released that day.
-    for rid in tqdm(release_ids, desc='Making indicators'):
-        releases[rid]['df'] = pd.DataFrame(
-            index=releases[rid]['dates'],
-            data={
-                releases[rid]['name']: 1
-            })
-        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
-
-    vix = yf.Ticker('^VIX')
-    vvix = yf.Ticker('^VVIX')
-    spx = yf.Ticker('^GSPC')
-
-    # Pull in data
-    data_files = {"spx": "SPX_full_30min.txt", "vix": "VIX_full_30min.txt", "vvix":'VVIX_full_30min.txt'}
-    data = load_dataset("boomsss/spx_intra", data_files=data_files)
-    dfs = []
-    for ticker in data.keys():
-        rows = [d['text'] for d in data[ticker]]
-        rows = [x.split(',') for x in rows]
-
-        fr = pd.DataFrame(columns=[
-            'Datetime','Open','High','Low','Close'
-        ], data = rows)
-
-        fr['Datetime'] = pd.to_datetime(fr['Datetime'])
-        fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
-        fr = fr.set_index('Datetime')
-        fr['Open'] = pd.to_numeric(fr['Open'])
-        fr['High'] = pd.to_numeric(fr['High'])
-        fr['Low'] = pd.to_numeric(fr['Low'])
-        fr['Close'] = pd.to_numeric(fr['Close'])
-        dfs.append(fr)
-
-    df_30m = pd.concat(dfs, axis=1)
-
-    df_30m.columns = [
-        'Open30',
-        'High30',
-        'Low30',
-        'Close30',
-        'Open_VIX30',
-        'High_VIX30',
-        'Low_VIX30',
-        'Close_VIX30',
-        'Open_VVIX30',
-        'High_VVIX30',
-        'Low_VVIX30',
-        'Close_VVIX30'
-    ]
-
-    # Get incremental date
-    last_date = df_30m.index.date[-1]
-    last_date = last_date + datetime.timedelta(days=1)
-
-    # Get incremental data for each index
-    spx1 = yf.Ticker('^GSPC')
-    vix1 = yf.Ticker('^VIX')
-    vvix1 = yf.Ticker('^VVIX')
-    yfp = spx1.history(start=last_date, interval='30m')
-    yf_vix = vix1.history(start=last_date, interval='30m')
-    yf_vvix = vvix1.history(start=last_date, interval='30m')
-
-    if len(yfp) > 0:
-        # Convert indexes to EST if not already
-        for _df in [yfp, yf_vix, yf_vvix]:
-            if _df.index.tz.zone != 'America/New_York':
-                _df['Datetime'] = pd.to_datetime(_df.index)
-                _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
-                _df.set_index('Datetime', inplace=True)
-        # Concat them
-        df_inc = pd.concat([
-            yfp[['Open','High','Low','Close']],
-            yf_vix[['Open','High','Low','Close']],
-            yf_vvix[['Open','High','Low','Close']]
-        ], axis=1)
-        df_inc.columns = df_30m.columns
-        df_inc = df_inc.loc[
-            (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
-        ]
-        df_30m = pd.concat([df_30m, df_inc])
-    else:
-        df_30m = df_30m.copy()
-
-    df_30m = df_30m.loc[
-        (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
-    ]
-    df_30m['dt'] = df_30m.index.date
-    df_30m = df_30m.groupby('dt').head(periods_30m)
-    df_30m = df_30m.set_index('dt',drop=True)
-    df_30m.index.name = 'Datetime'
-
-    df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
-    df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
-    df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
-
-    opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
-    highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
-    lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
-    closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
-    spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
-    vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
-    vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
-
-    df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
-
-
-    prices_vix = vix.history(start=data_start_date, interval='1d')
-    prices_vvix = vvix.history(start=data_start_date, interval='1d')
-    prices_spx = spx.history(start=data_start_date, interval='1d')
-
-    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
-    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
-    prices_spx.index = prices_spx['index']
-    prices_spx = prices_spx.drop(columns='index')
-    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
-
-    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
-    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
-    prices_vix.index = prices_vix['index']
-    prices_vix = prices_vix.drop(columns='index')
-    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
-
-    prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
-    prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
-    prices_vvix.index = prices_vvix['index']
-    prices_vvix = prices_vvix.drop(columns='index')
-    prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
-
-    data = prices_spx.merge(df_intra, left_index=True, right_index=True)
-    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
-    data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
-
-    # Features
-    data['PrevClose'] = data['Close'].shift(1)
-    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
-    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
-    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
-    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
-    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
-
-    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
-    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
-
-    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
-    data['VVIX5Day_n1'] = data['VVIX5Day'].astype(bool)
-
-    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
-    data['RangePct'] = data['Range'] / data['Close']
-    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
-    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
-    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
-    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
-    data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
-    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
-    data['RangePct_n1'] = data['RangePct'].shift(1)
-    data['RangePct_n2'] = data['RangePct'].shift(2)
-    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
-    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
-    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
-    data['CurrentGapHist'] = data['CurrentGap'].copy()
-    data['CurrentGap'] = data['CurrentGap'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['DayOfWeek'] = data['DayOfWeek'].dt.day
-
-    # Intraday features
-    data['CurrentOpen30'] = data['Open30'].shift(-1)
-    data['CurrentHigh30'] = data['High30'].shift(-1)
-    data['CurrentLow30'] = data['Low30'].shift(-1)
-    data['CurrentClose30'] = data['Close30'].shift(-1)
-    data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)
-    data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
-    data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
-    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
-
-    data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
-    data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
-
-    data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
-
-    # Open to High
-    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
-    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
-    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
-    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
-    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
-
-    # Target -- the next day's low
-    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
-    data['Target'] = data['Target'].shift(-1)
-    # data['Target'] = data['RangePct'].shift(-1)
-
-    # Target for clf -- whether tomorrow will close above or below today's close
-    data['Target_clf'] = data['Close'] > data['PrevClose']
-    data['Target_clf'] = data['Target_clf'].shift(-1)
-    data['DayOfWeek'] = pd.to_datetime(data.index)
-    data['Quarter'] = data['DayOfWeek'].dt.quarter
-    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
-
-    # Calculate up
-    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
-
-    # Calculate upSD
-    data['upSD'] = data['up'].rolling(30).std(ddof=0)
-
-    # Calculate aveUp
-    data['aveUp'] = data['up'].rolling(30).mean()
-    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
-    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
-    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
-    data['downSD'] = data['down'].rolling(30).std(ddof=0)
-    data['aveDown'] = data['down'].rolling(30).mean()
-    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
-    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
-
-    data = data.assign(
-        L1Touch = lambda x: x['Low'] < x['L1'],
-        L2Touch = lambda x: x['Low'] < x['L2'],
-        H1Touch = lambda x: x['High'] > x['H1'],
-        H2Touch = lambda x: x['High'] > x['H2'],
-        L1Break = lambda x: x['Close'] < x['L1'],
-        L1TouchRed = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['PrevClose']),
-        L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
-        L2Break = lambda x: x['Close'] < x['L2'],
-        H1Break = lambda x: x['Close'] > x['H1'],
-        H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
-        H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
-        H2Break = lambda x: x['Close'] > x['H2'],
-        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
-        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
-        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
-        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
-        CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
-        CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
-        CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
-        CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
-    )
-
-    data['OpenL1'] = data['OpenL1'].shift(-1)
-    data['OpenL2'] = data['OpenL2'].shift(-1)
-    data['OpenH1'] = data['OpenH1'].shift(-1)
-    data['OpenH2'] = data['OpenH2'].shift(-1)
-    data['CloseL1'] = data['CloseL1'].shift(-1)
-    data['CloseL2'] = data['CloseL2'].shift(-1)
-    data['CloseH1'] = data['CloseH1'].shift(-1)
-    data['CloseH2'] = data['CloseH2'].shift(-1)
-
-    level_cols = [
-        'L1Touch',
-        'L2Touch',
-        'H1Touch',
-        'H2Touch',
-        'L1Break',
-        'L2Break',
-        'H1Break',
-        'H2Break'
-    ]
-
-    for col in level_cols:
-        data[col+'Pct'] = data[col].rolling(100).mean()
-        # data[col+'Pct'] = data[col+'Pct'].shift(-1)
-
-    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
-    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
-    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
|
455 |
-
data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
|
456 |
-
data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
|
457 |
-
data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
|
458 |
-
|
459 |
-
data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
|
460 |
-
data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
|
461 |
-
|
462 |
-
def get_quintiles(df, col_name, q):
|
463 |
-
return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
|
464 |
-
|
465 |
-
probas = []
|
466 |
-
# Given the current price level
|
467 |
-
for i, pct in enumerate(data['CurrentClose30toClose']):
|
468 |
-
try:
|
469 |
-
# Split
|
470 |
-
df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
|
471 |
-
for q in df_q.index:
|
472 |
-
if q.left <= pct <= q.right:
|
473 |
-
p = df_q[q]
|
474 |
-
except:
|
475 |
-
p = None
|
476 |
-
|
477 |
-
probas.append(p)
|
478 |
-
|
479 |
-
# gapfills = []
|
480 |
-
# for i, pct in enumerate(data['CurrentGap']):
|
481 |
-
# try:
|
482 |
-
# df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
|
483 |
-
# for q in df_q.index:
|
484 |
-
# if q.left <= pct <= q.right:
|
485 |
-
# p = df_q[q]
|
486 |
-
# except:
|
487 |
-
# p = None
|
488 |
-
|
489 |
-
# gapfills.append(p)
|
490 |
-
|
491 |
-
data['GreenProbas'] = probas
|
492 |
-
# data['GapFillGreenProba'] = gapfills
|
493 |
-
|
494 |
-
for rid in tqdm(release_ids, desc='Merging econ data'):
|
495 |
-
# Get the name of the release
|
496 |
-
n = releases[rid]['name']
|
497 |
-
# Merge the corresponding DF of the release
|
498 |
-
data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
|
499 |
-
# Create a column that shifts the value in the merged column up by 1
|
500 |
-
data[f'{n}_shift'] = data[n].shift(-1)
|
501 |
-
# Fill the rest with zeroes
|
502 |
-
data[n] = data[n].fillna(0)
|
503 |
-
data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
|
504 |
-
|
505 |
-
data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
|
506 |
-
|
507 |
-
def cumul_sum(col):
|
508 |
-
nums = []
|
509 |
-
s = 0
|
510 |
-
for x in col:
|
511 |
-
if x == 1:
|
512 |
-
s += 1
|
513 |
-
elif x == 0:
|
514 |
-
s = 0
|
515 |
-
nums.append(s)
|
516 |
-
return nums
|
517 |
-
|
518 |
-
consec_green = cumul_sum(data['GreenDay'].values)
|
519 |
-
consec_red = cumul_sum(data['RedDay'].values)
|
520 |
-
|
521 |
-
data['DaysGreen'] = consec_green
|
522 |
-
data['DaysRed'] = consec_red
|
523 |
-
|
524 |
-
final_row = data.index[-2]
|
525 |
-
|
526 |
-
exp_row = data.index[-1]
|
527 |
-
|
528 |
-
df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
|
529 |
-
df_final = df_final.dropna(subset=['Target','Target_clf'])
|
530 |
-
# df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
|
531 |
-
return data, df_final, final_row
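
Side note on the block above, since the same ideas resurface elsewhere in the repo: H1/H2/L1/L2 are intraday expected-move levels built from the trailing 30-day mean (and standard deviation) of the prior sessions' open-to-high and open-to-low excursions, projected off today's open, and the *Touch/*Break columns then track rolling 100-day hit rates against them. Here is a minimal sketch of that construction on synthetic OHLC data; the column names mirror the code above, but the data and everything else is made up for illustration:

```python
import numpy as np
import pandas as pd

# Synthetic daily OHLC data -- purely illustrative, not pulled from the app's database
rng = np.random.default_rng(0)
idx = pd.bdate_range('2023-01-02', periods=150)
close = 4000 + rng.normal(0, 20, len(idx)).cumsum()
open_ = close + rng.normal(0, 5, len(idx))
high = np.maximum(open_, close) + rng.uniform(1, 25, len(idx))
low = np.minimum(open_, close) - rng.uniform(1, 25, len(idx))
data = pd.DataFrame({'Open': open_, 'High': high, 'Low': low, 'Close': close}, index=idx)

# Yesterday's open-to-high and open-to-low excursions, as a % of yesterday's close
data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)

# Trailing 30-day average and dispersion of those excursions
data['aveUp'] = data['up'].rolling(30).mean()
data['upSD'] = data['up'].rolling(30).std(ddof=0)
data['aveDown'] = data['down'].rolling(30).mean()
data['downSD'] = data['down'].rolling(30).std(ddof=0)

# Project the levels off today's open (same algebra as the H1/H2/L1/L2 lines above)
data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']

# Rolling hit rate: how often the last 100 sessions tagged H1 intraday
data['H1Touch'] = data['High'] > data['H1']
data['H1TouchPct'] = data['H1Touch'].rolling(100).mean()
print(data[['H1', 'H2', 'L1', 'L2', 'H1TouchPct']].tail())
```

The GreenProbas loop above works on the same principle as the calibration step in model_intra_v2.py: bucket a historical signal (here HistClose30toPrevClose) into quantiles and use each bucket's realized green-day rate as the probability for the current reading.
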
model_intra_v2.py
CHANGED
@@ -15,58 +15,70 @@ import lightgbm as lgb
 from sklearn.model_selection import TimeSeriesSplit
 from intraCols import model_cols
 
-
-def walk_forward_validation(df, target_column, num_periods):
+def walk_forward_validation(df, target_column, num_periods, mode='full'):
 
     df = df[model_cols + [target_column]]
     df[target_column] = df[target_column].astype(bool)
 
-    # Model
-    # model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
-
     tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
 
-    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
-        try:
-            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
-            for q in df_q.index:
-                if q.left <= pct <= q.right:
-                    p = df_q[q]
-        except:
-            p = None
-
-        greenprobas
-
+    if mode == 'full':
+        overall_results = []
+        # Iterate over the rows in the DataFrame, one step at a time
+        # Split the time series data using TimeSeriesSplit
+        for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
+            # Extract the training and testing data for the current split
+            X_train = df.drop(target_column, axis=1).iloc[train_index]
+            y_train = df[target_column].iloc[train_index]
+            X_test = df.drop(target_column, axis=1).iloc[test_index]
+            y_test = df[target_column].iloc[test_index]
+
+            y_train = y_train.astype(bool)
+            model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+            model.fit(X_train, y_train)
+            # Make a prediction on the test data
+            predictions = model.predict_proba(X_test)[:,-1]
+
+            # Create a DataFrame to store the true and predicted values
+            result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+            overall_results.append(result_df)
+        df_results = pd.concat(overall_results)
+
+        # Calibrate Probabilities
+        def get_quantiles(df, col_name, q):
+            return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+        greenprobas = []
+        for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
+            try:
+                df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+                for q in df_q.index:
+                    if q.left <= pct <= q.right:
+                        p = df_q[q]
+            except:
+                p = None
+
+            greenprobas.append(p)
+
+        df_results['CalibPredicted'] = greenprobas
+
+        return df_results, model
+
+    elif mode == 'single':
+        X_train = df.drop(target_column, axis=1).iloc[:-1]
+        y_train = df[target_column].iloc[:-1]
+        X_test = df.drop(target_column, axis=1).iloc[-1]
+        y_test = df[target_column].iloc[-1]
+        y_train = y_train.astype(bool)
+        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+        model.fit(X_train, y_train)
+        predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+        return result_df, model
 
 
 def seq_predict_proba(df, trained_clf_model):
     clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
requirements.txt
CHANGED
@@ -19,4 +19,5 @@ huggingface_hub
 holidays
 pytz
 sqlalchemy<2.0
-mysqlclient
+mysqlclient
+mplfinance
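
The only genuinely new dependency here is mplfinance (mysqlclient is removed and re-added, which looks like a trailing-newline fix). The diff itself does not say what mplfinance is for; presumably it backs candlestick rendering somewhere in the app. A generic, self-contained example of the kind of call it enables, with data pulled via yfinance rather than anything from this repo:

```python
import mplfinance as mpf
import yfinance as yf

# Illustrative only: grab a few days of 30-minute S&P 500 candles and render them
spx = yf.Ticker('^GSPC').history(period='5d', interval='30m')
mpf.plot(spx[['Open', 'High', 'Low', 'Close']], type='candle', style='charles', title='SPX 30m')
```
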
troubleshoot_day_model.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff