boomsss committed on
Commit
acdf71a
•
1 Parent(s): 835ace6
Files changed (11)
  1. README.md +5 -0
  2. getDailyData.py +75 -27
  3. getIntraData.py +5 -5
  4. model_1h.py +0 -481
  5. model_30m.py +0 -506
  6. model_90m.py +0 -481
  7. model_day.py +0 -434
  8. model_intra.py +0 -531
  9. model_intra_v2.py +52 -40
  10. requirements.txt +2 -1
  11. troubleshoot_day_model.ipynb +0 -0
README.md CHANGED
@@ -9,6 +9,11 @@ app_file: app.py
9
  pinned: false
10
  ---
11
12
  # TL;DR on WTF
13
  - The purpose of this project is to predict whether the current day's close will be above the previous day's close (`Target`).
14
  - Predictions are produced through generalized stacking of an ensemble of 2 models.
 
9
  pinned: false
10
  ---
11
 
12
+ # Gameday SPX: An ML Approach to Predicting 🟥 or 🟩
13
+ Given the proliferation of trading styles available to retail investors, there are many different ways to make money in today's market. This model is an effort to simplify my personal trading style: gather most of the features I consider important, feed them to a model, and get an output for whether the day will be green for $SPX (close above yesterday's close) or red (close below yesterday's close).
14
+
15
+ This is specific to $SPX, because I primarily trade this index. Justification for
16
+
17
  # TL;DR on WTF
18
  - The purpose of this project is to predict whether the current day's close will be above the previous day's close (`Target`).
19
  - Predictions are produced through generalized stacking of an ensemble of 2 models.
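For readers skimming the diff, the "generalized stacking of an ensemble of 2 models" mentioned above is the pattern implemented in `walk_forward_validation_seq` / `seq_predict_proba` in the model scripts shown later in this diff: a linear regression predicts the next day's return, and the sign of that prediction is appended as a boolean feature for a LightGBM classifier that outputs the green/red probability. Below is a minimal sketch of that idea only, using synthetic data and a placeholder feature set rather than the project's real pipeline:

```python
# Minimal sketch of the 2-model stack described in the README -- not the
# production code. The data and feature values here are synthetic placeholders.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMClassifier

rng = np.random.default_rng(42)
n = 500
df = pd.DataFrame({
    'CurrentGap': rng.normal(0, 0.005, n),    # placeholder feature
    'RangePct': rng.normal(0.01, 0.003, n),   # placeholder feature
    'Target': rng.normal(0, 0.01, n),         # next-day return (regression target)
})
df['Target_clf'] = df['Target'] > 0           # green (True) / red (False) label

features = ['CurrentGap', 'RangePct']
train, test = df.iloc[:400], df.iloc[400:]

# Stage 1: regression on the continuous return target
reg = LinearRegression().fit(train[features], train['Target'])

# Stage 2: classifier that also sees the regressor's sign as a feature
train_clf = train[features].assign(RegrModelOut=reg.predict(train[features]) > 0)
clf = LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
clf.fit(train_clf, train['Target_clf'])

# Inference mirrors seq_predict_proba: run the regressor first, then the classifier
test_clf = test[features].assign(RegrModelOut=reg.predict(test[features]) > 0)
green_proba = clf.predict_proba(test_clf)[:, -1]
print(green_proba[:5])
```

In the deleted `model_*.py` files below, the same two fits are wrapped in an expanding-window walk-forward loop; the per-timeframe scripts differ mainly in how many opening 30-minute bars are aggregated into the intraday features.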
getDailyData.py CHANGED
@@ -9,7 +9,10 @@ from tqdm import tqdm
9
  import os
10
  import datetime
11
  import json
 
12
  from sqlalchemy import create_engine
 
 
13
 
14
  data_start_date = '2018-07-01'
15
 
@@ -24,37 +27,82 @@ def get_daily(mode='daily', periods_30m=None):
24
  vvix = yf.Ticker('^VVIX')
25
  spx = yf.Ticker('^GSPC')
26
 
27
- prices_vix = vix.history(start=data_start_date, interval='1d')
28
- prices_vvix = vvix.history(start=data_start_date, interval='1d')
29
- prices_spx = spx.history(start=data_start_date, interval='1d')
30
-
31
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
32
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
33
- prices_spx.index = prices_spx['index']
34
- prices_spx = prices_spx.drop(columns='index')
35
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
36
-
37
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
38
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
39
- prices_vix.index = prices_vix['index']
40
- prices_vix = prices_vix.drop(columns='index')
41
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
42
-
43
- prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
44
- prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
45
- prices_vvix.index = prices_vvix['index']
46
- prices_vvix = prices_vvix.drop(columns='index')
47
- prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
48
-
49
  if mode == 'intra':
50
  from getIntraData import get_intra
51
  df_intra = get_intra(periods_30m)
52
- data = prices_spx.merge(df_intra, left_index=True, right_index=True)
53
  else:
54
- data = prices_spx.copy()
55
-
56
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
57
- data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
58
 
59
  # Features
60
  data['PrevClose'] = data['Close'].shift(1)
 
9
  import os
10
  import datetime
11
  import json
12
+ from pandas.tseries.offsets import BDay
13
  from sqlalchemy import create_engine
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
 
17
  data_start_date = '2018-07-01'
18
 
 
27
  vvix = yf.Ticker('^VVIX')
28
  spx = yf.Ticker('^GSPC')
29
 
30
+ # Grab data from db
31
+ engine = create_engine(
32
+ f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
33
+ f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
34
+ f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
35
+ )
36
+
37
+ query = f'''SELECT
38
+ spx.Datetime AS Datetime,
39
+ spx.Open AS Open,
40
+ spx.High AS High,
41
+ spx.Low AS Low,
42
+ spx.Close AS Close,
43
+ vix.Open AS Open_VIX,
44
+ vix.High AS High_VIX,
45
+ vix.Low AS Low_VIX,
46
+ vix.Close AS Close_VIX,
47
+ vvix.Open AS Open_VVIX,
48
+ vvix.High AS High_VVIX,
49
+ vvix.Low AS Low_VVIX,
50
+ vvix.Close AS Close_VVIX
51
+ FROM
52
+ SPX_full_1day AS spx
53
+ LEFT JOIN
54
+ VIX_full_1day AS vix ON spx.Datetime = vix.Datetime AND vix.Datetime > '{data_start_date}'
55
+ LEFT JOIN
56
+ VVIX_full_1day AS vvix ON spx.Datetime = vvix.Datetime AND vvix.Datetime > '{data_start_date}'
57
+ WHERE
58
+ spx.Datetime > '{data_start_date}'
59
+
60
+ '''
61
+ data = pd.read_sql_query(sql=query, con=engine.connect())
62
+ data['Datetime'] = pd.to_datetime(data['Datetime'])
63
+ data = data.set_index('Datetime',drop=True)
64
+
65
+ # Get incremental date
66
+ last_date = data.index.date[-1]
67
+ last_date = last_date + BDay(1)
68
+
69
+ prices_vix = vix.history(start=last_date, interval='1d')
70
+ prices_vvix = vvix.history(start=last_date, interval='1d')
71
+ prices_spx = spx.history(start=last_date, interval='1d')
72
+
73
+ if len(prices_spx) > 0:
74
+
75
+ prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
76
+ prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
77
+ prices_spx.index = prices_spx['index']
78
+ prices_spx = prices_spx.drop(columns='index')
79
+ prices_spx.index = pd.DatetimeIndex(prices_spx.index)
80
+
81
+ prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
82
+ prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
83
+ prices_vix.index = prices_vix['index']
84
+ prices_vix = prices_vix.drop(columns='index')
85
+ prices_vix.index = pd.DatetimeIndex(prices_vix.index)
86
+
87
+ prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
88
+ prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
89
+ prices_vvix.index = prices_vvix['index']
90
+ prices_vvix = prices_vvix.drop(columns='index')
91
+ prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
92
+
93
+ data1 = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
94
+ data1 = data1.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
95
+ data = pd.concat([data, data1])
96
+
97
+ else:
98
+ data = data.copy()
99
+
100
  if mode == 'intra':
101
  from getIntraData import get_intra
102
  df_intra = get_intra(periods_30m)
103
+ data = data.merge(df_intra, left_index=True, right_index=True)
104
  else:
105
+ data = data.copy()
 
 
 
106
 
107
  # Features
108
  data['PrevClose'] = data['Close'].shift(1)
getIntraData.py CHANGED
@@ -6,10 +6,10 @@ import datetime
6
  from sqlalchemy import create_engine
7
  import os
8
  from getDailyData import data_start_date
9
- # from dotenv import load_dotenv
10
 
11
  # Load environment variables from the .env file
12
- # load_dotenv()
13
 
14
  def get_intra(periods_30m = 1):
15
  '''
@@ -38,11 +38,11 @@ def get_intra(periods_30m = 1):
38
  FROM
39
  SPX_full_30min AS spx30
40
  LEFT JOIN
41
- VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > {data_start_date}
42
  LEFT JOIN
43
- VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > {data_start_date}
44
  WHERE
45
- spx30.Datetime > {data_start_date}
46
 
47
  '''
48
  # spx30 = pd.read_sql_query(f'SELECT * FROM SPX_full_30min WHERE Datetime > {data_start_date}', con=engine)
 
6
  from sqlalchemy import create_engine
7
  import os
8
  from getDailyData import data_start_date
9
+ from dotenv import load_dotenv
10
 
11
  # Load environment variables from the .env file
12
+ load_dotenv()
13
 
14
  def get_intra(periods_30m = 1):
15
  '''
 
38
  FROM
39
  SPX_full_30min AS spx30
40
  LEFT JOIN
41
+ VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > '{data_start_date}'
42
  LEFT JOIN
43
+ VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > '{data_start_date}'
44
  WHERE
45
+ spx30.Datetime > '{data_start_date}'
46
 
47
  '''
48
  # spx30 = pd.read_sql_query(f'SELECT * FROM SPX_full_30min WHERE Datetime > {data_start_date}', con=engine)
model_1h.py DELETED
@@ -1,481 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
- from pandas.tseries.offsets import BDay
18
- from datasets import load_dataset
19
- import lightgbm as lgb
20
-
21
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
22
-
23
- # Create an XGBRegressor model
24
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
25
- model = linear_model.LinearRegression()
26
-
27
- overall_results = []
28
- # Iterate over the rows in the DataFrame, one step at a time
29
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
30
- # Split the data into training and test sets
31
- X_train = df.drop(target_column, axis=1).iloc[:i]
32
- y_train = df[target_column].iloc[:i]
33
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
34
- y_test = df[target_column].iloc[i:i+num_periods]
35
-
36
- # Fit the model to the training data
37
- model.fit(X_train, y_train)
38
-
39
- # Make a prediction on the test data
40
- predictions = model.predict(X_test)
41
-
42
- # Create a DataFrame to store the true and predicted values
43
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
44
-
45
- overall_results.append(result_df)
46
-
47
- df_results = pd.concat(overall_results)
48
- # model.save_model('model_lr.bin')
49
- # Return the true and predicted values, and fitted model
50
- return df_results, model
51
-
52
- model_cols = [
53
- 'BigNewsDay',
54
- 'Quarter',
55
- 'Perf5Day',
56
- 'Perf5Day_n1',
57
- 'DaysGreen',
58
- 'DaysRed',
59
- 'CurrentHigh30toClose',
60
- 'CurrentLow30toClose',
61
- 'CurrentClose30toClose',
62
- 'CurrentRange30',
63
- 'GapFill30',
64
- 'CurrentGap',
65
- 'RangePct',
66
- 'RangePct_n1',
67
- 'RangePct_n2',
68
- 'OHLC4_VIX',
69
- 'OHLC4_VIX_n1',
70
- 'OHLC4_VIX_n2',
71
- 'OpenL1',
72
- 'OpenL2',
73
- 'OpenH1',
74
- 'OpenH2',
75
- 'L1TouchPct',
76
- 'L2TouchPct',
77
- 'H1TouchPct',
78
- 'H2TouchPct',
79
- 'L1BreakPct',
80
- 'L2BreakPct',
81
- 'H1BreakPct',
82
- 'H2BreakPct',
83
- 'GreenProbas',
84
- # 'GapFillGreenProba'
85
-
86
- ]
87
-
88
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
89
-
90
- # Create run the regression model to get its target
91
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
92
- # joblib.dump(model1, 'model1.bin')
93
-
94
- # Merge the result df back on the df for feeding into the classifier
95
- for_merge = res[['Predicted']]
96
- for_merge.columns = ['RegrModelOut']
97
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
98
- df = df.merge(for_merge, left_index=True, right_index=True)
99
- df = df.drop(columns=[target_column_regr])
100
- df = df[model_cols + ['RegrModelOut', target_column_clf]]
101
-
102
- df[target_column_clf] = df[target_column_clf].astype(bool)
103
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
104
-
105
- # Create an XGBRegressor model
106
- # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
107
- model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
108
- # model = linear_model.LogisticRegression(max_iter=1500)
109
-
110
- overall_results = []
111
- # Iterate over the rows in the DataFrame, one step at a time
112
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
113
- # Split the data into training and test sets
114
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
115
- y_train = df[target_column_clf].iloc[:i]
116
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
117
- y_test = df[target_column_clf].iloc[i:i+num_periods]
118
-
119
- # Fit the model to the training data
120
- model2.fit(X_train, y_train)
121
-
122
- # Make a prediction on the test data
123
- predictions = model2.predict_proba(X_test)[:,-1]
124
-
125
- # Create a DataFrame to store the true and predicted values
126
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
127
-
128
- overall_results.append(result_df)
129
-
130
- df_results = pd.concat(overall_results)
131
- # model1.save_model('model_ensemble.bin')
132
- # joblib.dump(model2, 'model2.bin')
133
- # Return the true and predicted values, and fitted model
134
- return df_results, model1, model2
135
-
136
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
137
- regr_pred = trained_reg_model.predict(df)
138
- regr_pred = regr_pred > 0
139
- new_df = df.copy()
140
- new_df['RegrModelOut'] = regr_pred
141
- clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
142
- return clf_pred_proba
143
-
144
- def get_data():
145
- # f = open('settings.json')
146
- # j = json.load(f)
147
- # API_KEY_FRED = j["API_KEY_FRED"]
148
-
149
- API_KEY_FRED = os.getenv('API_KEY_FRED')
150
-
151
- def parse_release_dates(release_id: str) -> List[str]:
152
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
153
- r = requests.get(release_dates_url)
154
- text = r.text
155
- soup = BeautifulSoup(text, 'xml')
156
- dates = []
157
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
158
- dates.append(release_date_tag.text)
159
- return dates
160
-
161
- def parse_release_dates_obs(series_id: str) -> List[str]:
162
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
163
- r = requests.get(obs_url)
164
- text = r.text
165
- soup = BeautifulSoup(text, 'xml')
166
- observations = []
167
- for observation_tag in soup.find_all('observation'):
168
- date = observation_tag.get('date')
169
- value = observation_tag.get('value')
170
- observations.append((date, value))
171
- return observations
172
-
173
- econ_dfs = {}
174
-
175
- econ_tickers = [
176
- 'WALCL',
177
- 'NFCI',
178
- 'WRESBAL'
179
- ]
180
-
181
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
182
- # p = parse_release_dates_obs(et)
183
- # df = pd.DataFrame(columns = ['ds',et], data = p)
184
- df = pdr.get_data_fred(et)
185
- df.index = df.index.rename('ds')
186
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
187
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
188
- econ_dfs[et] = df
189
-
190
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
191
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
192
-
193
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
194
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
195
-
196
- release_ids = [
197
- "10", # "Consumer Price Index"
198
- "46", # "Producer Price Index"
199
- "50", # "Employment Situation"
200
- "53", # "Gross Domestic Product"
201
- "103", # "Discount Rate Meeting Minutes"
202
- "180", # "Unemployment Insurance Weekly Claims Report"
203
- "194", # "ADP National Employment Report"
204
- "323" # "Trimmed Mean PCE Inflation Rate"
205
- ]
206
-
207
- release_names = [
208
- "CPI",
209
- "PPI",
210
- "NFP",
211
- "GDP",
212
- "FOMC",
213
- "UNEMP",
214
- "ADP",
215
- "PCE"
216
- ]
217
-
218
- releases = {}
219
-
220
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
221
- releases[rid] = {}
222
- releases[rid]['dates'] = parse_release_dates(rid)
223
- releases[rid]['name'] = n
224
-
225
- # Create a DF that has all dates with the name of the col as 1
226
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
227
- # This column serves as the true/false indicator of whether there was economic data released that day.
228
- for rid in tqdm(release_ids, desc='Making indicators'):
229
- releases[rid]['df'] = pd.DataFrame(
230
- index=releases[rid]['dates'],
231
- data={
232
- releases[rid]['name']: 1
233
- })
234
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
235
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
236
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
237
-
238
- vix = yf.Ticker('^VIX')
239
- spx = yf.Ticker('^GSPC')
240
-
241
-
242
- # Pull in data
243
- data = load_dataset("boomsss/spx_intra", split='train')
244
-
245
- rows = [d['text'] for d in data]
246
- rows = [x.split(',') for x in rows]
247
-
248
- fr = pd.DataFrame(columns=[
249
- 'Datetime','Open','High','Low','Close'
250
- ], data = rows)
251
-
252
- fr['Datetime'] = pd.to_datetime(fr['Datetime'])
253
- fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
254
- fr = fr.set_index('Datetime')
255
- fr['Open'] = pd.to_numeric(fr['Open'])
256
- fr['High'] = pd.to_numeric(fr['High'])
257
- fr['Low'] = pd.to_numeric(fr['Low'])
258
- fr['Close'] = pd.to_numeric(fr['Close'])
259
-
260
- # Get incremental date
261
- last_date = fr.index.date[-1]
262
- last_date = last_date + datetime.timedelta(days=1)
263
- # Get incremental data
264
- spx1 = yf.Ticker('^GSPC')
265
- yfp = spx1.history(start=last_date, interval='30m')
266
-
267
- if len(yfp) > 0:
268
- # Concat current and incremental
269
- df_30m = pd.concat([fr, yfp])
270
- else:
271
- df_30m = fr.copy()
272
-
273
- # Get the first 30 minute bar
274
- df_30m = df_30m.reset_index()
275
- df_30m['Datetime'] = df_30m['Datetime'].dt.date
276
- df_30m = df_30m.groupby('Datetime').head(2)
277
- df_30m = df_30m.set_index('Datetime',drop=True)
278
- # Rename the columns
279
- df_30m = df_30m[['Open','High','Low','Close']]
280
-
281
- opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
282
- highs_1h = df_30m.groupby('Datetime')['High'].max()
283
- lows_1h = df_30m.groupby('Datetime')['Low'].min()
284
- closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
285
-
286
- df_1h = pd.DataFrame(index=df_30m.index.unique())
287
- df_1h['Open'] = opens_1h
288
- df_1h['High'] = highs_1h
289
- df_1h['Low'] = lows_1h
290
- df_1h['Close'] = closes_1h
291
-
292
- df_1h.columns = ['Open30','High30','Low30','Close30']
293
-
294
- prices_vix = vix.history(start='2018-07-01', interval='1d')
295
- prices_spx = spx.history(start='2018-07-01', interval='1d')
296
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
297
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
298
- prices_spx.index = prices_spx['index']
299
- prices_spx = prices_spx.drop(columns='index')
300
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
301
-
302
-
303
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
304
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
305
- prices_vix.index = prices_vix['index']
306
- prices_vix = prices_vix.drop(columns='index')
307
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
308
-
309
-
310
- data = prices_spx.merge(df_1h, left_index=True, right_index=True)
311
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
312
-
313
- # Features
314
- data['PrevClose'] = data['Close'].shift(1)
315
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
316
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
317
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
318
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
319
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
320
-
321
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
322
- data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
323
-
324
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
325
- data['RangePct'] = data['Range'] / data['Close']
326
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
327
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
328
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
329
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
330
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
331
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
332
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
333
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
334
- data['RangePct_n1'] = data['RangePct'].shift(1)
335
- data['RangePct_n2'] = data['RangePct'].shift(2)
336
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
337
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
338
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
339
- data['CurrentGapHist'] = data['CurrentGap'].copy()
340
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
341
- data['DayOfWeek'] = pd.to_datetime(data.index)
342
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
343
-
344
- # Calculate up
345
- data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
346
-
347
- # Calculate upSD
348
- data['upSD'] = data['up'].rolling(30).std(ddof=0)
349
-
350
- # Calculate aveUp
351
- data['aveUp'] = data['up'].rolling(30).mean()
352
- data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
353
- data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
354
- data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
355
- data['downSD'] = data['down'].rolling(30).std(ddof=0)
356
- data['aveDown'] = data['down'].rolling(30).mean()
357
- data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
358
- data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
359
-
360
- data = data.assign(
361
- L1Touch = lambda x: x['Low'] < x['L1'],
362
- L2Touch = lambda x: x['Low'] < x['L2'],
363
- H1Touch = lambda x: x['High'] > x['H1'],
364
- H2Touch = lambda x: x['High'] > x['H2'],
365
- L1Break = lambda x: x['Close'] < x['L1'],
366
- L2Break = lambda x: x['Close'] < x['L2'],
367
- H1Break = lambda x: x['Close'] > x['H1'],
368
- H2Break = lambda x: x['Close'] > x['H2'],
369
- OpenL1 = lambda x: x['Open'] / x['L1'],
370
- OpenL2 = lambda x: x['Open'] / x['L2'],
371
- OpenH1 = lambda x: x['Open'] / x['H1'],
372
- OpenH2 = lambda x: x['Open'] / x['H2']
373
- )
374
-
375
- level_cols = [
376
- 'L1Touch',
377
- 'L2Touch',
378
- 'H1Touch',
379
- 'H2Touch',
380
- 'L1Break',
381
- 'L2Break',
382
- 'H1Break',
383
- 'H2Break'
384
- ]
385
-
386
- for col in level_cols:
387
- data[col+'Pct'] = data[col].rolling(100).mean()
388
- data[col+'Pct'] = data[col+'Pct'].shift(-1)
389
-
390
- # Intraday features
391
- data['CurrentHigh30'] = data['High30'].shift(-1)
392
- data['CurrentLow30'] = data['Low30'].shift(-1)
393
- data['CurrentClose30'] = data['Close30'].shift(-1)
394
- data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
395
-
396
- # Open to High
397
- data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
398
- data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
399
- data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
400
- data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
401
- data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
402
-
403
- # Target -- the next day's low
404
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
405
- data['Target'] = data['Target'].shift(-1)
406
- # data['Target'] = data['RangePct'].shift(-1)
407
-
408
- # Target for clf -- whether tomorrow will close above or below today's close
409
- data['Target_clf'] = data['Close'] > data['PrevClose']
410
- data['Target_clf'] = data['Target_clf'].shift(-1)
411
- data['DayOfWeek'] = pd.to_datetime(data.index)
412
- data['Quarter'] = data['DayOfWeek'].dt.quarter
413
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
414
-
415
- def get_quintiles(df, col_name, q):
416
- return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
417
-
418
- probas = []
419
- for i, pct in enumerate(data['CurrentClose30toClose']):
420
- try:
421
- df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
422
- for q in df_q.index:
423
- if q.left <= pct <= q.right:
424
- p = df_q[q]
425
- except:
426
- p = None
427
-
428
- probas.append(p)
429
-
430
- # gapfills = []
431
- # for i, pct in enumerate(data['CurrentGap']):
432
- # try:
433
- # df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
434
- # for q in df_q.index:
435
- # if q.left <= pct <= q.right:
436
- # p = df_q[q]
437
- # except:
438
- # p = None
439
-
440
- # gapfills.append(p)
441
-
442
- data['GreenProbas'] = probas
443
- # data['GapFillGreenProba'] = gapfills
444
-
445
- for rid in tqdm(release_ids, desc='Merging econ data'):
446
- # Get the name of the release
447
- n = releases[rid]['name']
448
- # Merge the corresponding DF of the release
449
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
450
- # Create a column that shifts the value in the merged column up by 1
451
- data[f'{n}_shift'] = data[n].shift(-1)
452
- # Fill the rest with zeroes
453
- data[n] = data[n].fillna(0)
454
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
455
-
456
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
457
-
458
- def cumul_sum(col):
459
- nums = []
460
- s = 0
461
- for x in col:
462
- if x == 1:
463
- s += 1
464
- elif x == 0:
465
- s = 0
466
- nums.append(s)
467
- return nums
468
-
469
- consec_green = cumul_sum(data['GreenDay'].values)
470
- consec_red = cumul_sum(data['RedDay'].values)
471
-
472
- data['DaysGreen'] = consec_green
473
- data['DaysRed'] = consec_red
474
-
475
- final_row = data.index[-2]
476
-
477
- exp_row = data.index[-1]
478
-
479
- df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
480
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
481
- return data, df_final, final_row
model_30m.py DELETED
@@ -1,506 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
- from pandas.tseries.offsets import BDay
18
- from datasets import load_dataset
19
- import lightgbm as lgb
20
-
21
- # If the dataset is gated/private, make sure you have run huggingface-cli login
22
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
23
-
24
- # Create an XGBRegressor model
25
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
26
- model = linear_model.LinearRegression()
27
-
28
- overall_results = []
29
- # Iterate over the rows in the DataFrame, one step at a time
30
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
31
- # Split the data into training and test sets
32
- X_train = df.drop(target_column, axis=1).iloc[:i]
33
- y_train = df[target_column].iloc[:i]
34
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
35
- y_test = df[target_column].iloc[i:i+num_periods]
36
-
37
- # Fit the model to the training data
38
- model.fit(X_train, y_train)
39
-
40
- # Make a prediction on the test data
41
- predictions = model.predict(X_test)
42
-
43
- # Create a DataFrame to store the true and predicted values
44
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
45
-
46
- overall_results.append(result_df)
47
-
48
- df_results = pd.concat(overall_results)
49
- # model.save_model('model_lr.bin')
50
- # Return the true and predicted values, and fitted model
51
- return df_results, model
52
-
53
- model_cols = [
54
- 'BigNewsDay',
55
- 'Quarter',
56
- 'Perf5Day',
57
- 'Perf5Day_n1',
58
- 'DaysGreen',
59
- 'DaysRed',
60
- 'CurrentHigh30toClose',
61
- 'CurrentLow30toClose',
62
- 'CurrentClose30toClose',
63
- 'CurrentRange30',
64
- 'GapFill30',
65
- 'CurrentGap',
66
- 'RangePct',
67
- 'RangePct_n1',
68
- 'RangePct_n2',
69
- 'OHLC4_VIX',
70
- 'OHLC4_VIX_n1',
71
- 'OHLC4_VIX_n2',
72
- 'OpenL1',
73
- 'OpenL2',
74
- 'OpenH1',
75
- 'OpenH2',
76
- 'L1TouchPct',
77
- 'L2TouchPct',
78
- 'H1TouchPct',
79
- 'H2TouchPct',
80
- 'L1BreakPct',
81
- 'L2BreakPct',
82
- 'H1BreakPct',
83
- 'H2BreakPct',
84
- 'GreenProbas',
85
- # 'GapFillGreenProba'
86
- ]
87
-
88
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
89
-
90
- # Create run the regression model to get its target
91
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
92
- # joblib.dump(model1, 'model1.bin')
93
-
94
- # Merge the result df back on the df for feeding into the classifier
95
- for_merge = res[['Predicted']]
96
- for_merge.columns = ['RegrModelOut']
97
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
98
- df = df.merge(for_merge, left_index=True, right_index=True)
99
- df = df.drop(columns=[target_column_regr])
100
- df = df[model_cols + ['RegrModelOut', target_column_clf]]
101
-
102
- df[target_column_clf] = df[target_column_clf].astype(bool)
103
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
104
-
105
- # Create an XGBRegressor model
106
- # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
107
- model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
108
- # model = linear_model.LogisticRegression(max_iter=1500)
109
-
110
- overall_results = []
111
- # Iterate over the rows in the DataFrame, one step at a time
112
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
113
- # Split the data into training and test sets
114
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
115
- y_train = df[target_column_clf].iloc[:i]
116
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
117
- y_test = df[target_column_clf].iloc[i:i+num_periods]
118
-
119
- # Fit the model to the training data
120
- model2.fit(X_train, y_train)
121
-
122
- # Make a prediction on the test data
123
- predictions = model2.predict_proba(X_test)[:,-1]
124
-
125
- # Create a DataFrame to store the true and predicted values
126
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
127
-
128
- overall_results.append(result_df)
129
-
130
- df_results = pd.concat(overall_results)
131
-
132
- # Calibrate Probabilities
133
- def get_quantiles(df, col_name, q):
134
- return df.groupby(pd.qcut(df[col_name], q))['True'].mean()
135
-
136
- greenprobas = []
137
- meanprobas = []
138
- for i, pct in enumerate(df_results['Predicted']):
139
- try:
140
- df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
141
- for q in df_q.index:
142
- if q.left <= pct <= q.right:
143
- p = df_q[q]
144
- c = (q.left + q.right) / 2
145
- except:
146
- p = None
147
- c = None
148
-
149
- greenprobas.append(p)
150
- meanprobas.append(c)
151
-
152
- df_results['CalibPredicted'] = meanprobas
153
- df_results['CalibGreenProba'] = greenprobas
154
-
155
- return df_results, model1, model2
156
-
157
-
158
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
159
- regr_pred = trained_reg_model.predict(df)
160
- regr_pred = regr_pred > 0
161
- new_df = df.copy()
162
- new_df['RegrModelOut'] = regr_pred
163
- clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
164
- return clf_pred_proba
165
-
166
- def get_data():
167
- # f = open('settings.json')
168
- # j = json.load(f)
169
- # API_KEY_FRED = j["API_KEY_FRED"]
170
-
171
- API_KEY_FRED = os.getenv('API_KEY_FRED')
172
-
173
- def parse_release_dates(release_id: str) -> List[str]:
174
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
175
- r = requests.get(release_dates_url)
176
- text = r.text
177
- soup = BeautifulSoup(text, 'xml')
178
- dates = []
179
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
180
- dates.append(release_date_tag.text)
181
- return dates
182
-
183
- def parse_release_dates_obs(series_id: str) -> List[str]:
184
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
185
- r = requests.get(obs_url)
186
- text = r.text
187
- soup = BeautifulSoup(text, 'xml')
188
- observations = []
189
- for observation_tag in soup.find_all('observation'):
190
- date = observation_tag.get('date')
191
- value = observation_tag.get('value')
192
- observations.append((date, value))
193
- return observations
194
-
195
- econ_dfs = {}
196
-
197
- econ_tickers = [
198
- 'WALCL',
199
- 'NFCI',
200
- 'WRESBAL'
201
- ]
202
-
203
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
204
- # p = parse_release_dates_obs(et)
205
- # df = pd.DataFrame(columns = ['ds',et], data = p)
206
- df = pdr.get_data_fred(et)
207
- df.index = df.index.rename('ds')
208
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
209
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
210
- econ_dfs[et] = df
211
-
212
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
213
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
214
-
215
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
216
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
217
-
218
- release_ids = [
219
- "10", # "Consumer Price Index"
220
- "46", # "Producer Price Index"
221
- "50", # "Employment Situation"
222
- "53", # "Gross Domestic Product"
223
- "103", # "Discount Rate Meeting Minutes"
224
- "180", # "Unemployment Insurance Weekly Claims Report"
225
- "194", # "ADP National Employment Report"
226
- "323" # "Trimmed Mean PCE Inflation Rate"
227
- ]
228
-
229
- release_names = [
230
- "CPI",
231
- "PPI",
232
- "NFP",
233
- "GDP",
234
- "FOMC",
235
- "UNEMP",
236
- "ADP",
237
- "PCE"
238
- ]
239
-
240
- releases = {}
241
-
242
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
243
- releases[rid] = {}
244
- releases[rid]['dates'] = parse_release_dates(rid)
245
- releases[rid]['name'] = n
246
-
247
- # Create a DF that has all dates with the name of the col as 1
248
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
249
- # This column serves as the true/false indicator of whether there was economic data released that day.
250
- for rid in tqdm(release_ids, desc='Making indicators'):
251
- releases[rid]['df'] = pd.DataFrame(
252
- index=releases[rid]['dates'],
253
- data={
254
- releases[rid]['name']: 1
255
- })
256
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
257
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
258
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
259
-
260
- vix = yf.Ticker('^VIX')
261
- spx = yf.Ticker('^GSPC')
262
-
263
- # Pull in data
264
- data = load_dataset("boomsss/spx_intra", split='train')
265
-
266
- rows = [d['text'] for d in data]
267
- rows = [x.split(',') for x in rows]
268
-
269
- fr = pd.DataFrame(columns=[
270
- 'Datetime','Open','High','Low','Close'
271
- ], data = rows)
272
-
273
- fr['Datetime'] = pd.to_datetime(fr['Datetime'])
274
- fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
275
- fr = fr.set_index('Datetime')
276
- fr['Open'] = pd.to_numeric(fr['Open'])
277
- fr['High'] = pd.to_numeric(fr['High'])
278
- fr['Low'] = pd.to_numeric(fr['Low'])
279
- fr['Close'] = pd.to_numeric(fr['Close'])
280
-
281
- # Get incremental date
282
- last_date = fr.index.date[-1]
283
- last_date = last_date + datetime.timedelta(days=1)
284
- # Get incremental data
285
- spx1 = yf.Ticker('^GSPC')
286
- yfp = spx1.history(start=last_date, interval='30m')
287
-
288
- if len(yfp) > 0:
289
- # Concat current and incremental
290
- df_30m = pd.concat([fr, yfp])
291
- else:
292
- df_30m = fr.copy()
293
-
294
- # Get the first 30 minute bar
295
- df_30m = df_30m.reset_index()
296
- df_30m['Datetime'] = df_30m['Datetime'].dt.date
297
- df_30m = df_30m.groupby('Datetime').head(1)
298
- df_30m = df_30m.set_index('Datetime',drop=True)
299
- # Rename the columns
300
- df_30m = df_30m[['Open','High','Low','Close']]
301
- df_30m.columns = ['Open30','High30','Low30','Close30']
302
-
303
- prices_vix = vix.history(start='2018-07-01', interval='1d')
304
- prices_spx = spx.history(start='2018-07-01', interval='1d')
305
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
306
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
307
- prices_spx.index = prices_spx['index']
308
- prices_spx = prices_spx.drop(columns='index')
309
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
310
-
311
-
312
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
313
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
314
- prices_vix.index = prices_vix['index']
315
- prices_vix = prices_vix.drop(columns='index')
316
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
317
-
318
-
319
- data = prices_spx.merge(df_30m, left_index=True, right_index=True)
320
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
321
-
322
- # Features
323
- data['PrevClose'] = data['Close'].shift(1)
324
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
325
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
326
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
327
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
328
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
329
-
330
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
331
- data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
332
-
333
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
334
- data['RangePct'] = data['Range'] / data['Close']
335
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
336
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
337
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
338
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
339
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
340
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
341
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)
342
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
343
- data['RangePct_n1'] = data['RangePct'].shift(1)
344
- data['RangePct_n2'] = data['RangePct'].shift(2)
345
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
346
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
347
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
348
- data['CurrentGapHist'] = data['CurrentGap'].copy()
349
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
350
- data['DayOfWeek'] = pd.to_datetime(data.index)
351
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
352
-
353
- # Intraday features
354
- data['CurrentOpen30'] = data['Open30'].shift(-1)
355
- data['CurrentHigh30'] = data['High30'].shift(-1)
356
- data['CurrentLow30'] = data['Low30'].shift(-1)
357
- data['CurrentClose30'] = data['Close30'].shift(-1)
358
- data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
359
-
360
-
361
- # Open to High
362
- data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
363
- data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
364
- data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
365
- data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
366
- data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
367
-
368
- # Target -- the next day's low
369
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
370
- data['Target'] = data['Target'].shift(-1)
371
- # data['Target'] = data['RangePct'].shift(-1)
372
-
373
- # Target for clf -- whether tomorrow will close above or below today's close
374
- data['Target_clf'] = data['Close'] > data['PrevClose']
375
- data['Target_clf'] = data['Target_clf'].shift(-1)
376
- data['DayOfWeek'] = pd.to_datetime(data.index)
377
- data['Quarter'] = data['DayOfWeek'].dt.quarter
378
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
379
-
380
- # Calculate up
381
- data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
382
-
383
- # Calculate upSD
384
- data['upSD'] = data['up'].rolling(30).std(ddof=0)
385
-
386
- # Calculate aveUp
387
- data['aveUp'] = data['up'].rolling(30).mean()
388
- data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
389
- data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
390
- data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
391
- data['downSD'] = data['down'].rolling(30).std(ddof=0)
392
- data['aveDown'] = data['down'].rolling(30).mean()
393
- data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
394
- data['L2'] = data['Open'] - ((data['aveDown'] + data['upSD']) / 100) * data['Open']
395
-
396
- data = data.assign(
397
- L1Touch = lambda x: x['Low'] < x['L1'],
398
- L2Touch = lambda x: x['Low'] < x['L2'],
399
- H1Touch = lambda x: x['High'] > x['H1'],
400
- H2Touch = lambda x: x['High'] > x['H2'],
401
- L1Break = lambda x: x['Close'] < x['L1'],
402
- L2Break = lambda x: x['Close'] < x['L2'],
403
- H1Break = lambda x: x['Close'] > x['H1'],
404
- H2Break = lambda x: x['Close'] > x['H2'],
405
- OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
406
- OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
407
- OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
408
- OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
409
- CloseL1 = lambda x: np.where(x['Close'] < x['L1'], 1, 0),
410
- CloseL2 = lambda x: np.where(x['Close'] < x['L2'], 1, 0),
411
- CloseH1 = lambda x: np.where(x['Close'] > x['H1'], 1, 0),
412
- CloseH2 = lambda x: np.where(x['Close'] > x['H2'], 1, 0)
413
- )
414
-
415
- data['OpenL1'] = data['OpenL1'].shift(-1)
416
- data['OpenL2'] = data['OpenL2'].shift(-1)
417
- data['OpenH1'] = data['OpenH1'].shift(-1)
418
- data['OpenH2'] = data['OpenH2'].shift(-1)
419
- data['CloseL1'] = data['CloseL1'].shift(-1)
420
- data['CloseL2'] = data['CloseL2'].shift(-1)
421
- data['CloseH1'] = data['CloseH1'].shift(-1)
422
- data['CloseH2'] = data['CloseH2'].shift(-1)
423
-
424
- level_cols = [
425
- 'L1Touch',
426
- 'L2Touch',
427
- 'H1Touch',
428
- 'H2Touch',
429
- 'L1Break',
430
- 'L2Break',
431
- 'H1Break',
432
- 'H2Break'
433
- ]
434
-
435
- for col in level_cols:
436
- data[col+'Pct'] = data[col].rolling(100).mean()
437
- data[col+'Pct'] = data[col+'Pct'].shift(-1)
438
-
439
-
440
- def get_quintiles(df, col_name, q):
441
- return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
442
-
443
- probas = []
444
- for i, pct in enumerate(data['CurrentClose30toClose']):
445
- try:
446
- df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
447
- for q in df_q.index:
448
- if q.left <= pct <= q.right:
449
- p = df_q[q]
450
- except:
451
- p = None
452
-
453
- probas.append(p)
454
-
455
- # gapfills = []
456
- # for i, pct in enumerate(data['CurrentGap']):
457
- # try:
458
- # df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
459
- # for q in df_q.index:
460
- # if q.left <= pct <= q.right:
461
- # p = df_q[q]
462
- # except:
463
- # p = None
464
-
465
- # gapfills.append(p)
466
-
467
- data['GreenProbas'] = probas
468
- # data['GapFillGreenProba'] = gapfills
469
-
470
- for rid in tqdm(release_ids, desc='Merging econ data'):
471
- # Get the name of the release
472
- n = releases[rid]['name']
473
- # Merge the corresponding DF of the release
474
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
475
- # Create a column that shifts the value in the merged column up by 1
476
- data[f'{n}_shift'] = data[n].shift(-1)
477
- # Fill the rest with zeroes
478
- data[n] = data[n].fillna(0)
479
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
480
-
481
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
482
-
483
- def cumul_sum(col):
484
- nums = []
485
- s = 0
486
- for x in col:
487
- if x == 1:
488
- s += 1
489
- elif x == 0:
490
- s = 0
491
- nums.append(s)
492
- return nums
493
-
494
- consec_green = cumul_sum(data['GreenDay'].values)
495
- consec_red = cumul_sum(data['RedDay'].values)
496
-
497
- data['DaysGreen'] = consec_green
498
- data['DaysRed'] = consec_red
499
-
500
- final_row = data.index[-2]
501
-
502
- exp_row = data.index[-1]
503
-
504
- df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
505
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
506
- return data, df_final, final_row
model_90m.py DELETED
@@ -1,481 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
- from pandas.tseries.offsets import BDay
18
- from datasets import load_dataset
19
- import lightgbm as lgb
20
-
21
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
22
-
23
- # Create an XGBRegressor model
24
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
25
- model = linear_model.LinearRegression()
26
-
27
- overall_results = []
28
- # Iterate over the rows in the DataFrame, one step at a time
29
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
30
- # Split the data into training and test sets
31
- X_train = df.drop(target_column, axis=1).iloc[:i]
32
- y_train = df[target_column].iloc[:i]
33
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
34
- y_test = df[target_column].iloc[i:i+num_periods]
35
-
36
- # Fit the model to the training data
37
- model.fit(X_train, y_train)
38
-
39
- # Make a prediction on the test data
40
- predictions = model.predict(X_test)
41
-
42
- # Create a DataFrame to store the true and predicted values
43
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
44
-
45
- overall_results.append(result_df)
46
-
47
- df_results = pd.concat(overall_results)
48
- # model.save_model('model_lr.bin')
49
- # Return the true and predicted values, and fitted model
50
- return df_results, model
51
-
52
- model_cols = [
53
- 'BigNewsDay',
54
- 'Quarter',
55
- 'Perf5Day',
56
- 'Perf5Day_n1',
57
- 'DaysGreen',
58
- 'DaysRed',
59
- 'CurrentHigh30toClose',
60
- 'CurrentLow30toClose',
61
- 'CurrentClose30toClose',
62
- 'CurrentRange30',
63
- 'GapFill30',
64
- 'CurrentGap',
65
- 'RangePct',
66
- 'RangePct_n1',
67
- 'RangePct_n2',
68
- 'OHLC4_VIX',
69
- 'OHLC4_VIX_n1',
70
- 'OHLC4_VIX_n2',
71
- 'OpenL1',
72
- 'OpenL2',
73
- 'OpenH1',
74
- 'OpenH2',
75
- 'L1TouchPct',
76
- 'L2TouchPct',
77
- 'H1TouchPct',
78
- 'H2TouchPct',
79
- 'L1BreakPct',
80
- 'L2BreakPct',
81
- 'H1BreakPct',
82
- 'H2BreakPct',
83
- 'GreenProbas',
84
- # 'GapFillGreenProba'
85
- ]
86
-
87
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
88
-
89
- # Create run the regression model to get its target
90
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
91
- # joblib.dump(model1, 'model1.bin')
92
-
93
- # Merge the result df back on the df for feeding into the classifier
94
- for_merge = res[['Predicted']]
95
- for_merge.columns = ['RegrModelOut']
96
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
97
- df = df.merge(for_merge, left_index=True, right_index=True)
98
- df = df.drop(columns=[target_column_regr])
99
- df = df[model_cols + ['RegrModelOut', target_column_clf]]
100
-
101
- df[target_column_clf] = df[target_column_clf].astype(bool)
102
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
103
-
104
- # Create an XGBRegressor model
105
- # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
106
- model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
107
- # model = linear_model.LogisticRegression(max_iter=1500)
108
-
109
- overall_results = []
110
- # Iterate over the rows in the DataFrame, one step at a time
111
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
112
- # Split the data into training and test sets
113
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
114
- y_train = df[target_column_clf].iloc[:i]
115
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
116
- y_test = df[target_column_clf].iloc[i:i+num_periods]
117
-
118
- # Fit the model to the training data
119
- model2.fit(X_train, y_train)
120
-
121
- # Make a prediction on the test data
122
- predictions = model2.predict_proba(X_test)[:,-1]
123
-
124
- # Create a DataFrame to store the true and predicted values
125
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
126
-
127
- overall_results.append(result_df)
128
-
129
- df_results = pd.concat(overall_results)
130
- # model1.save_model('model_ensemble.bin')
131
- # joblib.dump(model2, 'model2.bin')
132
- # Return the true and predicted values, and fitted model
133
- return df_results, model1, model2
134
-
135
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
136
- regr_pred = trained_reg_model.predict(df)
137
- regr_pred = regr_pred > 0
138
- new_df = df.copy()
139
- new_df['RegrModelOut'] = regr_pred
140
- clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
141
- return clf_pred_proba
142
-
143
- def get_data():
144
- # f = open('settings.json')
145
- # j = json.load(f)
146
- # API_KEY_FRED = j["API_KEY_FRED"]
147
-
148
- API_KEY_FRED = os.getenv('API_KEY_FRED')
149
-
150
- def parse_release_dates(release_id: str) -> List[str]:
151
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
152
- r = requests.get(release_dates_url)
153
- text = r.text
154
- soup = BeautifulSoup(text, 'xml')
155
- dates = []
156
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
157
- dates.append(release_date_tag.text)
158
- return dates
159
-
160
- def parse_release_dates_obs(series_id: str) -> List[str]:
161
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
162
- r = requests.get(obs_url)
163
- text = r.text
164
- soup = BeautifulSoup(text, 'xml')
165
- observations = []
166
- for observation_tag in soup.find_all('observation'):
167
- date = observation_tag.get('date')
168
- value = observation_tag.get('value')
169
- observations.append((date, value))
170
- return observations
171
-
172
- econ_dfs = {}
173
-
174
- econ_tickers = [
175
- 'WALCL',
176
- 'NFCI',
177
- 'WRESBAL'
178
- ]
179
-
180
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
181
- # p = parse_release_dates_obs(et)
182
- # df = pd.DataFrame(columns = ['ds',et], data = p)
183
- df = pdr.get_data_fred(et)
184
- df.index = df.index.rename('ds')
185
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
186
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
187
- econ_dfs[et] = df
188
-
189
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
190
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
191
-
192
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
193
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
194
-
195
- release_ids = [
196
- "10", # "Consumer Price Index"
197
- "46", # "Producer Price Index"
198
- "50", # "Employment Situation"
199
- "53", # "Gross Domestic Product"
200
- "103", # "Discount Rate Meeting Minutes"
201
- "180", # "Unemployment Insurance Weekly Claims Report"
202
- "194", # "ADP National Employment Report"
203
- "323" # "Trimmed Mean PCE Inflation Rate"
204
- ]
205
-
206
- release_names = [
207
- "CPI",
208
- "PPI",
209
- "NFP",
210
- "GDP",
211
- "FOMC",
212
- "UNEMP",
213
- "ADP",
214
- "PCE"
215
- ]
216
-
217
- releases = {}
218
-
219
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
220
- releases[rid] = {}
221
- releases[rid]['dates'] = parse_release_dates(rid)
222
- releases[rid]['name'] = n
223
-
224
- # Create a DF that has all dates with the name of the col as 1
225
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
226
- # This column serves as the true/false indicator of whether there was economic data released that day.
227
- for rid in tqdm(release_ids, desc='Making indicators'):
228
- releases[rid]['df'] = pd.DataFrame(
229
- index=releases[rid]['dates'],
230
- data={
231
- releases[rid]['name']: 1
232
- })
233
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
234
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
235
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
236
-
237
- vix = yf.Ticker('^VIX')
238
- spx = yf.Ticker('^GSPC')
239
-
240
-
241
- # Pull in data
242
- data = load_dataset("boomsss/spx_intra", split='train')
243
-
244
- rows = [d['text'] for d in data]
245
- rows = [x.split(',') for x in rows]
246
-
247
- fr = pd.DataFrame(columns=[
248
- 'Datetime','Open','High','Low','Close'
249
- ], data = rows)
250
-
251
- fr['Datetime'] = pd.to_datetime(fr['Datetime'])
252
- fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
253
- fr = fr.set_index('Datetime')
254
- fr['Open'] = pd.to_numeric(fr['Open'])
255
- fr['High'] = pd.to_numeric(fr['High'])
256
- fr['Low'] = pd.to_numeric(fr['Low'])
257
- fr['Close'] = pd.to_numeric(fr['Close'])
258
-
259
- # Get incremental date
260
- last_date = fr.index.date[-1]
261
- last_date = last_date + datetime.timedelta(days=1)
262
- # Get incremental data
263
- spx1 = yf.Ticker('^GSPC')
264
- yfp = spx1.history(start=last_date, interval='30m')
265
-
266
- if len(yfp) > 0:
267
- # Concat current and incremental
268
- df_30m = pd.concat([fr, yfp])
269
- else:
270
- df_30m = fr.copy()
271
-
272
- # Get the first 30 minute bar
273
- df_30m = df_30m.reset_index()
274
- df_30m['Datetime'] = df_30m['Datetime'].dt.date
275
- df_30m = df_30m.groupby('Datetime').head(3)
276
- df_30m = df_30m.set_index('Datetime',drop=True)
277
- # Rename the columns
278
- df_30m = df_30m[['Open','High','Low','Close']]
279
-
280
- opens_1h = df_30m.groupby('Datetime')['Open'].head(1)
281
- highs_1h = df_30m.groupby('Datetime')['High'].max()
282
- lows_1h = df_30m.groupby('Datetime')['Low'].min()
283
- closes_1h = df_30m.groupby('Datetime')['Close'].tail(1)
284
-
285
- df_1h = pd.DataFrame(index=df_30m.index.unique())
286
- df_1h['Open'] = opens_1h
287
- df_1h['High'] = highs_1h
288
- df_1h['Low'] = lows_1h
289
- df_1h['Close'] = closes_1h
290
-
291
- df_1h.columns = ['Open30','High30','Low30','Close30']
292
-
293
- prices_vix = vix.history(start='2018-07-01', interval='1d')
294
- prices_spx = spx.history(start='2018-07-01', interval='1d')
295
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
296
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
297
- prices_spx.index = prices_spx['index']
298
- prices_spx = prices_spx.drop(columns='index')
299
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
300
-
301
-
302
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
303
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
304
- prices_vix.index = prices_vix['index']
305
- prices_vix = prices_vix.drop(columns='index')
306
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
307
-
308
-
309
- data = prices_spx.merge(df_1h, left_index=True, right_index=True)
310
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
311
-
312
- # Features
313
- data['PrevClose'] = data['Close'].shift(1)
314
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
315
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
316
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
317
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
318
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
319
-
320
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
321
- data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
322
-
323
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
324
- data['RangePct'] = data['Range'] / data['Close']
325
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
326
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
327
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
328
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
329
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
330
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
331
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
332
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
333
- data['RangePct_n1'] = data['RangePct'].shift(1)
334
- data['RangePct_n2'] = data['RangePct'].shift(2)
335
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
336
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
337
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
338
- data['CurrentGapHist'] = data['CurrentGap'].copy()
339
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
340
- data['DayOfWeek'] = pd.to_datetime(data.index)
341
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
342
-
343
- # Intraday features
344
- data['CurrentHigh30'] = data['High30'].shift(-1)
345
- data['CurrentLow30'] = data['Low30'].shift(-1)
346
- data['CurrentClose30'] = data['Close30'].shift(-1)
347
- data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
348
-
349
-
350
- # Open to High
351
- data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
352
- data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
353
- data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
354
- data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
355
- data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
356
-
357
- # Target for the regression model -- next day's OHLC4 return relative to today's close
358
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
359
- data['Target'] = data['Target'].shift(-1)
360
- # data['Target'] = data['RangePct'].shift(-1)
361
-
362
- # Target for clf -- whether tomorrow will close above or below today's close
363
- data['Target_clf'] = data['Close'] > data['PrevClose']
364
- data['Target_clf'] = data['Target_clf'].shift(-1)
365
- data['DayOfWeek'] = pd.to_datetime(data.index)
366
- data['Quarter'] = data['DayOfWeek'].dt.quarter
367
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
368
-
369
- # Calculate up
370
- data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
371
-
372
- # Calculate upSD
373
- data['upSD'] = data['up'].rolling(30).std(ddof=0)
374
-
375
- # Calculate aveUp
376
- data['aveUp'] = data['up'].rolling(30).mean()
377
- data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
378
- data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
379
- data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
380
- data['downSD'] = data['down'].rolling(30).std(ddof=0)
381
- data['aveDown'] = data['down'].rolling(30).mean()
382
- data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
383
- data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
384
-
385
- data = data.assign(
386
- L1Touch = lambda x: x['Low'] < x['L1'],
387
- L2Touch = lambda x: x['Low'] < x['L2'],
388
- H1Touch = lambda x: x['High'] > x['H1'],
389
- H2Touch = lambda x: x['High'] > x['H2'],
390
- L1Break = lambda x: x['Close'] < x['L1'],
391
- L2Break = lambda x: x['Close'] < x['L2'],
392
- H1Break = lambda x: x['Close'] > x['H1'],
393
- H2Break = lambda x: x['Close'] > x['H2'],
394
- OpenL1 = lambda x: x['Open'] / x['L1'],
395
- OpenL2 = lambda x: x['Open'] / x['L2'],
396
- OpenH1 = lambda x: x['Open'] / x['H1'],
397
- OpenH2 = lambda x: x['Open'] / x['H2']
398
- )
399
-
400
- level_cols = [
401
- 'L1Touch',
402
- 'L2Touch',
403
- 'H1Touch',
404
- 'H2Touch',
405
- 'L1Break',
406
- 'L2Break',
407
- 'H1Break',
408
- 'H2Break'
409
- ]
410
-
411
- for col in level_cols:
412
- data[col+'Pct'] = data[col].rolling(100).mean()
413
- data[col+'Pct'] = data[col+'Pct'].shift(-1)
414
-
415
- def get_quintiles(df, col_name, q):
416
- return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
417
-
418
- probas = []
419
- for i, pct in enumerate(data['CurrentClose30toClose']):
420
- try:
421
- df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 5)
422
- for q in df_q.index:
423
- if q.left <= pct <= q.right:
424
- p = df_q[q]
425
- except:
426
- p = None
427
-
428
- probas.append(p)
429
-
430
- # gapfills = []
431
- # for i, pct in enumerate(data['CurrentGap']):
432
- # try:
433
- # df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
434
- # for q in df_q.index:
435
- # if q.left <= pct <= q.right:
436
- # p = df_q[q]
437
- # except:
438
- # p = None
439
-
440
- # gapfills.append(p)
441
-
442
- data['GreenProbas'] = probas
443
- # data['GapFillGreenProba'] = gapfills
444
-
445
- for rid in tqdm(release_ids, desc='Merging econ data'):
446
- # Get the name of the release
447
- n = releases[rid]['name']
448
- # Merge the corresponding DF of the release
449
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
450
- # Create a column that shifts the value in the merged column up by 1
451
- data[f'{n}_shift'] = data[n].shift(-1)
452
- # Fill the rest with zeroes
453
- data[n] = data[n].fillna(0)
454
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
455
-
456
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
457
-
458
- def cumul_sum(col):
459
- nums = []
460
- s = 0
461
- for x in col:
462
- if x == 1:
463
- s += 1
464
- elif x == 0:
465
- s = 0
466
- nums.append(s)
467
- return nums
468
-
469
- consec_green = cumul_sum(data['GreenDay'].values)
470
- consec_red = cumul_sum(data['RedDay'].values)
471
-
472
- data['DaysGreen'] = consec_green
473
- data['DaysRed'] = consec_red
474
-
475
- final_row = data.index[-2]
476
-
477
- exp_row = data.index[-1]
478
-
479
- df_final = data.loc[:final_row, model_cols + ['Target','Target_clf']]
480
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
481
- return data, df_final, final_row
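
Note (editor): the GreenProbas feature built in the deleted file above buckets the first-30-minute move against the expanding history of HistClose30toPrevClose and reads off the historical share of green days for the matching bucket. A self-contained sketch of that idea follows; the helper name and the usage line are illustrative placeholders, not part of the repo's API.

import numpy as np
import pandas as pd

def expanding_bucket_probability(feature: pd.Series, outcome: pd.Series, q: int = 5) -> pd.Series:
    """For each row, cut the *prior* history of `feature` into q quantile bins
    and return that bin's historical mean of `outcome` (e.g. share of green days)."""
    probas = []
    for i in range(len(feature)):
        hist_x, hist_y = feature.iloc[:i], outcome.iloc[:i]
        p = np.nan
        if hist_x.notna().sum() >= 2 * q:  # need enough history to form the bins
            bins = pd.qcut(hist_x, q, duplicates='drop')
            bucket_means = hist_y.groupby(bins).mean()
            for interval, m in bucket_means.items():
                if interval.left <= feature.iloc[i] <= interval.right:
                    p = m
                    break
        probas.append(p)
    return pd.Series(probas, index=feature.index)

# e.g. data['GreenProbas'] = expanding_bucket_probability(data['HistClose30toPrevClose'], data['GreenDay'])
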
model_day.py DELETED
@@ -1,434 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import json
7
- import requests
8
- from bs4 import BeautifulSoup
9
- from typing import List
10
- import xgboost as xgb
11
- from tqdm import tqdm
12
- from sklearn import linear_model
13
- import joblib
14
- import os
15
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
16
- import datetime
17
- from pandas.tseries.offsets import BDay
18
- import lightgbm as lgb
19
-
20
- def walk_forward_validation(df, target_column, num_training_rows, num_periods):
21
-
22
- # Create the regression model (linear regression; the XGBRegressor line below is kept commented out for reference)
23
- # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
24
- model = linear_model.LinearRegression()
25
-
26
- overall_results = []
27
- # Iterate over the rows in the DataFrame, one step at a time
28
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),desc='LR Model'):
29
- # Split the data into training and test sets
30
- X_train = df.drop(target_column, axis=1).iloc[:i]
31
- y_train = df[target_column].iloc[:i]
32
- X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
33
- y_test = df[target_column].iloc[i:i+num_periods]
34
-
35
- # Fit the model to the training data
36
- model.fit(X_train, y_train)
37
-
38
- # Make a prediction on the test data
39
- predictions = model.predict(X_test)
40
-
41
- # Create a DataFrame to store the true and predicted values
42
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
43
-
44
- overall_results.append(result_df)
45
-
46
- df_results = pd.concat(overall_results)
47
- # model.save_model('model_lr.bin')
48
- # Return the true and predicted values, and fitted model
49
- return df_results, model
50
-
51
- model_cols = [
52
- 'BigNewsDay',
53
- 'Quarter',
54
- 'Perf5Day',
55
- 'Perf5Day_n1',
56
- 'DaysGreen',
57
- 'DaysRed',
58
- 'CurrentGap',
59
- 'RangePct',
60
- 'RangePct_n1',
61
- 'RangePct_n2',
62
- 'OHLC4_VIX',
63
- 'OHLC4_VIX_n1',
64
- 'OHLC4_VIX_n2',
65
- 'VIXOpen',
66
- 'VVIXOpen',
67
- 'OpenL1',
68
- 'OpenL2',
69
- 'OpenH1',
70
- 'OpenH2',
71
- 'L1TouchPct',
72
- 'L2TouchPct',
73
- 'H1TouchPct',
74
- 'H2TouchPct',
75
- 'L1BreakPct',
76
- 'L2BreakPct',
77
- 'H1BreakPct',
78
- 'H2BreakPct',
79
- 'H1BreakTouchPct',
80
- 'H2BreakTouchPct',
81
- 'L1BreakTouchPct',
82
- 'L2BreakTouchPct'
83
- ]
84
-
85
- def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
86
-
87
- # First, run the regression model to get its out-of-sample predictions
88
- res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
89
- # joblib.dump(model1, 'model1.bin')
90
-
91
- # Merge the result df back on the df for feeding into the classifier
92
- for_merge = res[['Predicted']]
93
- for_merge.columns = ['RegrModelOut']
94
- for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
95
- df = df.merge(for_merge, left_index=True, right_index=True)
96
- df = df.drop(columns=[target_column_regr])
97
- df = df[model_cols + ['RegrModelOut', target_column_clf]]
98
-
99
- df[target_column_clf] = df[target_column_clf].astype(bool)
100
- df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
101
-
102
- # Create the classifier model (LightGBM; the XGBClassifier line below is kept commented out for reference)
103
- # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
104
- model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
105
- # model = linear_model.LogisticRegression(max_iter=1500)
106
-
107
- overall_results = []
108
- # Iterate over the rows in the DataFrame, one step at a time
109
- for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1),'CLF Model'):
110
- # Split the data into training and test sets
111
- X_train = df.drop(target_column_clf, axis=1).iloc[:i]
112
- y_train = df[target_column_clf].iloc[:i]
113
- X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
114
- y_test = df[target_column_clf].iloc[i:i+num_periods]
115
-
116
- # Fit the model to the training data
117
- model2.fit(X_train, y_train)
118
-
119
- # Make a prediction on the test data
120
- predictions = model2.predict_proba(X_test)[:,-1]
121
-
122
- # Create a DataFrame to store the true and predicted values
123
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
124
-
125
- overall_results.append(result_df)
126
-
127
- df_results = pd.concat(overall_results)
128
-
129
- # Calibrate Probabilities
130
- def get_quantiles(df, col_name, q):
131
- return df.groupby(pd.cut(df[col_name], q))['True'].mean()
132
-
133
- greenprobas = []
134
- meanprobas = []
135
- for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
136
- try:
137
- df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
138
- for q in df_q.index:
139
- if q.left <= pct <= q.right:
140
- p = df_q[q]
141
- c = (q.left + q.right) / 2
142
- except:
143
- p = None
144
- c = None
145
-
146
- greenprobas.append(p)
147
- meanprobas.append(c)
148
-
149
- df_results['CalibPredicted'] = greenprobas
150
-
151
- return df_results, model1, model2
152
-
153
- def seq_predict_proba(df, trained_reg_model, trained_clf_model):
154
- regr_pred = trained_reg_model.predict(df)
155
- regr_pred = regr_pred > 0
156
- new_df = df.copy()
157
- new_df['RegrModelOut'] = regr_pred
158
- clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
159
- return clf_pred_proba
160
-
161
- def get_data():
162
- # f = open('settings.json')
163
- # j = json.load(f)
164
- # API_KEY_FRED = j["API_KEY_FRED"]
165
-
166
- API_KEY_FRED = os.getenv('API_KEY_FRED')
167
-
168
- def parse_release_dates(release_id: str) -> List[str]:
169
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
170
- r = requests.get(release_dates_url)
171
- text = r.text
172
- soup = BeautifulSoup(text, 'xml')
173
- dates = []
174
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
175
- dates.append(release_date_tag.text)
176
- return dates
177
-
178
- def parse_release_dates_obs(series_id: str) -> List[str]:
179
- obs_url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
180
- r = requests.get(obs_url)
181
- text = r.text
182
- soup = BeautifulSoup(text, 'xml')
183
- observations = []
184
- for observation_tag in soup.find_all('observation'):
185
- date = observation_tag.get('date')
186
- value = observation_tag.get('value')
187
- observations.append((date, value))
188
- return observations
189
-
190
- econ_dfs = {}
191
-
192
- econ_tickers = [
193
- 'WALCL',
194
- 'NFCI',
195
- 'WRESBAL'
196
- ]
197
-
198
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
199
- # p = parse_release_dates_obs(et)
200
- # df = pd.DataFrame(columns = ['ds',et], data = p)
201
- df = pdr.get_data_fred(et)
202
- df.index = df.index.rename('ds')
203
- # df.index = pd.to_datetime(df.index.rename('ds')).dt.tz_localize(None)
204
- # df['ds'] = pd.to_datetime(df['ds']).dt.tz_localize(None)
205
- econ_dfs[et] = df
206
-
207
- # walcl = pd.DataFrame(columns = ['ds','WALCL'], data = p)
208
- # walcl['ds'] = pd.to_datetime(walcl['ds']).dt.tz_localize(None)
209
-
210
- # nfci = pd.DataFrame(columns = ['ds','NFCI'], data = p2)
211
- # nfci['ds'] = pd.to_datetime(nfci['ds']).dt.tz_localize(None)
212
-
213
- release_ids = [
214
- "10", # "Consumer Price Index"
215
- "46", # "Producer Price Index"
216
- "50", # "Employment Situation"
217
- "53", # "Gross Domestic Product"
218
- "103", # "Discount Rate Meeting Minutes"
219
- "180", # "Unemployment Insurance Weekly Claims Report"
220
- "194", # "ADP National Employment Report"
221
- "323" # "Trimmed Mean PCE Inflation Rate"
222
- ]
223
-
224
- release_names = [
225
- "CPI",
226
- "PPI",
227
- "NFP",
228
- "GDP",
229
- "FOMC",
230
- "UNEMP",
231
- "ADP",
232
- "PCE"
233
- ]
234
-
235
- releases = {}
236
-
237
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
238
- releases[rid] = {}
239
- releases[rid]['dates'] = parse_release_dates(rid)
240
- releases[rid]['name'] = n
241
-
242
- # Create a DF that has all dates with the name of the col as 1
243
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
244
- # This column serves as the true/false indicator of whether there was economic data released that day.
245
- for rid in tqdm(release_ids, desc='Making indicators'):
246
- releases[rid]['df'] = pd.DataFrame(
247
- index=releases[rid]['dates'],
248
- data={
249
- releases[rid]['name']: 1
250
- })
251
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
252
- # releases[rid]['df']['ds'] = pd.to_datetime(releases[rid]['df']['ds']).dt.tz_localize(None)
253
- # releases[rid]['df'] = releases[rid]['df'].set_index('ds')
254
-
255
- vix = yf.Ticker('^VIX')
256
- vvix = yf.Ticker('^VVIX')
257
- spx = yf.Ticker('^GSPC')
258
-
259
- prices_vix = vix.history(start='2018-07-01', interval='1d')
260
- prices_spx = spx.history(start='2018-07-01', interval='1d')
261
- prices_vvix = vvix.history(start='2018-07-01', interval='1d')
262
-
263
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
264
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
265
- prices_spx.index = prices_spx['index']
266
- prices_spx = prices_spx.drop(columns='index')
267
-
268
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
269
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
270
- prices_vix.index = prices_vix['index']
271
- prices_vix = prices_vix.drop(columns='index')
272
-
273
- prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
274
- prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
275
- prices_vvix.index = prices_vvix['index']
276
- prices_vvix = prices_vvix.drop(columns='index')
277
-
278
- data = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
279
- data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
280
- data.index = pd.DatetimeIndex(data.index)
281
-
282
- # Features
283
- data['PrevClose'] = data['Close'].shift(1)
284
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
285
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1).astype(bool)
286
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
287
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
288
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
289
- data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
290
- data['VIXOpen'] = data['Open_VIX'] > data['Close_VIX'].shift(1)
291
- data['VVIXOpen'] = data['Open_VVIX'] > data['Close_VVIX'].shift(1)
292
- data['VIXOpen'] = data['VIXOpen'].astype(bool)
293
- data['VVIXOpen'] = data['VVIXOpen'].astype(bool)
294
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)
295
- data['RangePct'] = data['Range'] / data['Close']
296
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
297
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
298
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
299
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
300
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1).astype(float)
301
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2).astype(float)
302
- data['RangePct_n1'] = data['RangePct'].shift(1)
303
- data['RangePct_n2'] = data['RangePct'].shift(2)
304
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
305
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
306
- data['CurrentGap'] = ((data['Open'] - data['PrevClose']) / data['PrevClose']).shift(-1)
307
- data['DayOfWeek'] = pd.to_datetime(data.index)
308
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
309
- data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
310
- data['upSD'] = data['up'].rolling(30).std(ddof=0)
311
- data['aveUp'] = data['up'].rolling(30).mean()
312
- data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
313
- data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
314
- data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
315
- data['downSD'] = data['down'].rolling(30).std(ddof=0)
316
- data['aveDown'] = data['down'].rolling(30).mean()
317
- data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
318
- data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
319
- data['L1Touch'] = data['Low'] < data['L1']
320
- data['L2Touch'] = data['Low'] < data['L2']
321
- data['H1Touch'] = data['High'] > data['H1']
322
- data['H2Touch'] = data['High'] > data['H2']
323
- data['L1Break'] = data['Close'] < data['L1']
324
- data['L2Break'] = data['Close'] < data['L2']
325
- data['H1Break'] = data['Close'] > data['H1']
326
- data['H2Break'] = data['Close'] > data['H2']
327
- data['OpenL1'] = data['Open'] / data['L1']
328
- data['OpenL2'] = data['Open'] / data['L2']
329
- data['OpenH1'] = data['Open'] / data['H1']
330
- data['OpenH2'] = data['Open'] / data['H2']
331
-
332
- level_cols = [
333
- 'L1Touch',
334
- 'L2Touch',
335
- 'H1Touch',
336
- 'H2Touch',
337
- 'L1Break',
338
- 'L2Break',
339
- 'H1Break',
340
- 'H2Break'
341
- ]
342
-
343
- for col in level_cols:
344
- data[col+'Pct'] = data[col].rolling(100).mean()
345
-
346
- data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
347
- data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
348
- data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
349
- data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
350
-
351
- # Target for the regression model -- next day's OHLC4 return relative to today's close
352
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
353
- data['Target'] = data['Target'].shift(-1)
354
- # data['Target'] = data['RangePct'].shift(-1)
355
-
356
- # Target for clf -- whether tomorrow will close above or below today's close
357
- data['Target_clf'] = data['Close'] > data['PrevClose']
358
- data['Target_clf'] = data['Target_clf'].shift(-1)
359
- data['DayOfWeek'] = pd.to_datetime(data.index)
360
- data['Quarter'] = data['DayOfWeek'].dt.quarter
361
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
362
-
363
- for rid in tqdm(release_ids, desc='Merging econ data'):
364
- # Get the name of the release
365
- n = releases[rid]['name']
366
- # Merge the corresponding DF of the release
367
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
368
- # Create a column that shifts the value in the merged column up by 1
369
- data[f'{n}_shift'] = data[n].shift(-1)
370
- # Fill the rest with zeroes
371
- data[n] = data[n].fillna(0)
372
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
373
-
374
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
375
-
376
- def cumul_sum(col):
377
- nums = []
378
- s = 0
379
- for x in col:
380
- if x == 1:
381
- s += 1
382
- elif x == 0:
383
- s = 0
384
- nums.append(s)
385
- return nums
386
-
387
- consec_green = cumul_sum(data['GreenDay'].values)
388
- consec_red = cumul_sum(data['RedDay'].values)
389
-
390
- data['DaysGreen'] = consec_green
391
- data['DaysRed'] = consec_red
392
-
393
- final_row = data.index[-2]
394
-
395
- exp_row = data.index[-1]
396
-
397
- df_final = data.loc[:final_row,
398
- [
399
- 'BigNewsDay',
400
- 'Quarter',
401
- 'Perf5Day',
402
- 'Perf5Day_n1',
403
- 'DaysGreen',
404
- 'DaysRed',
405
- 'CurrentGap',
406
- 'RangePct',
407
- 'RangePct_n1',
408
- 'RangePct_n2',
409
- 'OHLC4_VIX',
410
- 'OHLC4_VIX_n1',
411
- 'OHLC4_VIX_n2',
412
- 'VIXOpen',
413
- 'VVIXOpen',
414
- 'OpenL1',
415
- 'OpenL2',
416
- 'OpenH1',
417
- 'OpenH2',
418
- 'L1TouchPct',
419
- 'L2TouchPct',
420
- 'H1TouchPct',
421
- 'H2TouchPct',
422
- 'L1BreakPct',
423
- 'L2BreakPct',
424
- 'H1BreakPct',
425
- 'H2BreakPct',
426
- 'H1BreakTouchPct',
427
- 'H2BreakTouchPct',
428
- 'L1BreakTouchPct',
429
- 'L2BreakTouchPct',
430
- 'Target',
431
- 'Target_clf'
432
- ]]
433
- df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
434
- return data, df_final, final_row
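
Note (editor): model_day.py and the other deleted model files derive the H1/H2 (upside) and L1/L2 (downside) intraday levels from yesterday's open-to-high and open-to-low moves. A condensed, standalone sketch of that construction is below; it assumes a standard daily OHLC DataFrame, uses downSD symmetrically for L2, and is illustrative rather than a drop-in replacement for the repo's functions.

import pandas as pd

def add_expected_range_levels(df: pd.DataFrame, window: int = 30) -> pd.DataFrame:
    """Project H1/H2 and L1/L2 levels off today's open, using the rolling mean
    and population std of yesterday's % moves relative to yesterday's close."""
    out = df.copy()
    # Yesterday's open-to-high and open-to-low moves, as a percent of yesterday's close
    out['up'] = 100 * (out['High'].shift(1) - out['Open'].shift(1)) / out['Close'].shift(1)
    out['down'] = 100 * (out['Open'].shift(1) - out['Low'].shift(1)) / out['Close'].shift(1)
    out['aveUp'] = out['up'].rolling(window).mean()
    out['upSD'] = out['up'].rolling(window).std(ddof=0)
    out['aveDown'] = out['down'].rolling(window).mean()
    out['downSD'] = out['down'].rolling(window).std(ddof=0)
    # Levels projected off today's open
    out['H1'] = out['Open'] * (1 + out['aveUp'] / 100)
    out['H2'] = out['Open'] * (1 + (out['aveUp'] + out['upSD']) / 100)
    out['L1'] = out['Open'] * (1 - out['aveDown'] / 100)
    out['L2'] = out['Open'] * (1 - (out['aveDown'] + out['downSD']) / 100)
    return out
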
model_intra.py DELETED
@@ -1,531 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import pandas_datareader as pdr
4
- import numpy as np
5
- import yfinance as yf
6
- import requests
7
- from bs4 import BeautifulSoup
8
- from typing import List
9
- from tqdm import tqdm
10
- import os
11
- import datetime
12
- from pandas.tseries.offsets import BDay
13
- from datasets import load_dataset
14
- import lightgbm as lgb
15
- from sklearn.model_selection import TimeSeriesSplit
16
- import json
17
-
18
- data_start_date = '2018-07-01'
19
-
20
- model_cols = [
21
- 'BigNewsDay',
22
- 'Quarter',
23
- 'Perf5Day',
24
- 'Perf5Day_n1',
25
- 'DaysGreen',
26
- 'DaysRed',
27
- 'CurrentHigh30toClose',
28
- 'CurrentLow30toClose',
29
- 'CurrentClose30toClose',
30
- 'CurrentRange30',
31
- 'GapFill30',
32
- 'CurrentGap',
33
- 'RangePct',
34
- 'RangePct_n1',
35
- 'RangePct_n2',
36
- 'OHLC4_VIX',
37
- 'OHLC4_VIX_n1',
38
- 'OHLC4_VIX_n2',
39
- 'OHLC4_Current_Trend',
40
- 'OHLC4_Trend',
41
- 'CurrentVIXTrend',
42
- 'SPX30IntraPerf',
43
- 'VIX30IntraPerf',
44
- 'VVIX30IntraPerf',
45
- # 'OpenL1',
46
- # 'OpenL2',
47
- # 'OpenH1',
48
- # 'OpenH2',
49
- 'L1TouchPct',
50
- 'L2TouchPct',
51
- 'H1TouchPct',
52
- 'H2TouchPct',
53
- 'L1BreakPct',
54
- 'L2BreakPct',
55
- 'H1BreakPct',
56
- 'H2BreakPct',
57
- 'GreenProbas',
58
- 'H1BreakTouchPct',
59
- 'H2BreakTouchPct',
60
- 'L1BreakTouchPct',
61
- 'L2BreakTouchPct',
62
- 'H1BreakH2TouchPct',
63
- 'L1BreakL2TouchPct',
64
- 'H1TouchGreenPct',
65
- 'L1TouchRedPct'
66
- # 'GapFillGreenProba'
67
- ]
68
-
69
- # If the dataset is gated/private, make sure you have run huggingface-cli login
70
- def walk_forward_validation(df, target_column, num_periods):
71
-
72
- df = df[model_cols + [target_column]]
73
- df[target_column] = df[target_column].astype(bool)
74
-
75
- # Model
76
- # model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
77
-
78
- tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
79
-
80
- overall_results = []
81
- # Iterate over the rows in the DataFrame, one step at a time
82
- # Split the time series data using TimeSeriesSplit
83
- for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
84
- # Extract the training and testing data for the current split
85
- X_train = df.drop(target_column, axis=1).iloc[train_index]
86
- y_train = df[target_column].iloc[train_index]
87
- X_test = df.drop(target_column, axis=1).iloc[test_index]
88
- y_test = df[target_column].iloc[test_index]
89
-
90
- y_train = y_train.astype(bool)
91
- model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
92
- model.fit(X_train, y_train)
93
- # Make a prediction on the test data
94
- predictions = model.predict_proba(X_test)[:,-1]
95
-
96
- # Create a DataFrame to store the true and predicted values
97
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
98
- overall_results.append(result_df)
99
-
100
- df_results = pd.concat(overall_results)
101
-
102
- # Calibrate Probabilities
103
- def get_quantiles(df, col_name, q):
104
- return df.groupby(pd.cut(df[col_name], q))['True'].mean()
105
-
106
- greenprobas = []
107
- for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
108
- try:
109
- df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
110
- for q in df_q.index:
111
- if q.left <= pct <= q.right:
112
- p = df_q[q]
113
- except:
114
- p = None
115
-
116
- greenprobas.append(p)
117
-
118
- df_results['CalibPredicted'] = greenprobas
119
-
120
- return df_results, model
121
-
122
- def seq_predict_proba(df, trained_clf_model):
123
- clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
124
- return clf_pred_proba
125
-
126
- def get_data(periods_30m = 1):
127
- # f = open('settings.json')
128
- # j = json.load(f)
129
- # API_KEY_FRED = j["API_KEY_FRED"]
130
-
131
- API_KEY_FRED = os.getenv('API_KEY_FRED')
132
-
133
- def parse_release_dates(release_id: str) -> List[str]:
134
- release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
135
- r = requests.get(release_dates_url)
136
- text = r.text
137
- soup = BeautifulSoup(text, 'xml')
138
- dates = []
139
- for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
140
- dates.append(release_date_tag.text)
141
- return dates
142
-
143
- econ_dfs = {}
144
-
145
- econ_tickers = [
146
- 'WALCL',
147
- 'NFCI',
148
- 'WRESBAL'
149
- ]
150
-
151
- for et in tqdm(econ_tickers, desc='getting econ tickers'):
152
- df = pdr.get_data_fred(et)
153
- df.index = df.index.rename('ds')
154
- econ_dfs[et] = df
155
-
156
- release_ids = [
157
- "10", # "Consumer Price Index"
158
- "46", # "Producer Price Index"
159
- "50", # "Employment Situation"
160
- "53", # "Gross Domestic Product"
161
- "103", # "Discount Rate Meeting Minutes"
162
- "180", # "Unemployment Insurance Weekly Claims Report"
163
- "194", # "ADP National Employment Report"
164
- "323" # "Trimmed Mean PCE Inflation Rate"
165
- ]
166
-
167
- release_names = [
168
- "CPI",
169
- "PPI",
170
- "NFP",
171
- "GDP",
172
- "FOMC",
173
- "UNEMP",
174
- "ADP",
175
- "PCE"
176
- ]
177
-
178
- releases = {}
179
-
180
- for rid, n in tqdm(zip(release_ids, release_names), total = len(release_ids), desc='Getting release dates'):
181
- releases[rid] = {}
182
- releases[rid]['dates'] = parse_release_dates(rid)
183
- releases[rid]['name'] = n
184
-
185
- # Create a DF that has all dates with the name of the col as 1
186
- # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0
187
- # This column serves as the true/false indicator of whether there was economic data released that day.
188
- for rid in tqdm(release_ids, desc='Making indicators'):
189
- releases[rid]['df'] = pd.DataFrame(
190
- index=releases[rid]['dates'],
191
- data={
192
- releases[rid]['name']: 1
193
- })
194
- releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
195
-
196
- vix = yf.Ticker('^VIX')
197
- vvix = yf.Ticker('^VVIX')
198
- spx = yf.Ticker('^GSPC')
199
-
200
- # Pull in data
201
- data_files = {"spx": "SPX_full_30min.txt", "vix": "VIX_full_30min.txt", "vvix":'VVIX_full_30min.txt'}
202
- data = load_dataset("boomsss/spx_intra", data_files=data_files)
203
- dfs = []
204
- for ticker in data.keys():
205
- rows = [d['text'] for d in data[ticker]]
206
- rows = [x.split(',') for x in rows]
207
-
208
- fr = pd.DataFrame(columns=[
209
- 'Datetime','Open','High','Low','Close'
210
- ], data = rows)
211
-
212
- fr['Datetime'] = pd.to_datetime(fr['Datetime'])
213
- fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
214
- fr = fr.set_index('Datetime')
215
- fr['Open'] = pd.to_numeric(fr['Open'])
216
- fr['High'] = pd.to_numeric(fr['High'])
217
- fr['Low'] = pd.to_numeric(fr['Low'])
218
- fr['Close'] = pd.to_numeric(fr['Close'])
219
- dfs.append(fr)
220
-
221
- df_30m = pd.concat(dfs, axis=1)
222
-
223
- df_30m.columns = [
224
- 'Open30',
225
- 'High30',
226
- 'Low30',
227
- 'Close30',
228
- 'Open_VIX30',
229
- 'High_VIX30',
230
- 'Low_VIX30',
231
- 'Close_VIX30',
232
- 'Open_VVIX30',
233
- 'High_VVIX30',
234
- 'Low_VVIX30',
235
- 'Close_VVIX30'
236
- ]
237
-
238
- # Get incremental date
239
- last_date = df_30m.index.date[-1]
240
- last_date = last_date + datetime.timedelta(days=1)
241
-
242
- # Get incremental data for each index
243
- spx1 = yf.Ticker('^GSPC')
244
- vix1 = yf.Ticker('^VIX')
245
- vvix1 = yf.Ticker('^VVIX')
246
- yfp = spx1.history(start=last_date, interval='30m')
247
- yf_vix = vix1.history(start=last_date, interval='30m')
248
- yf_vvix = vvix1.history(start=last_date, interval='30m')
249
-
250
- if len(yfp) > 0:
251
- # Convert indexes to EST if not already
252
- for _df in [yfp, yf_vix, yf_vvix]:
253
- if _df.index.tz.zone != 'America/New_York':
254
- _df['Datetime'] = pd.to_datetime(_df.index)
255
- _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
256
- _df.set_index('Datetime', inplace=True)
257
- # Concat them
258
- df_inc = pd.concat([
259
- yfp[['Open','High','Low','Close']],
260
- yf_vix[['Open','High','Low','Close']],
261
- yf_vvix[['Open','High','Low','Close']]
262
- ], axis=1)
263
- df_inc.columns = df_30m.columns
264
- df_inc = df_inc.loc[
265
- (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
266
- ]
267
- df_30m = pd.concat([df_30m, df_inc])
268
- else:
269
- df_30m = df_30m.copy()
270
-
271
- df_30m = df_30m.loc[
272
- (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
273
- ]
274
- df_30m['dt'] = df_30m.index.date
275
- df_30m = df_30m.groupby('dt').head(periods_30m)
276
- df_30m = df_30m.set_index('dt',drop=True)
277
- df_30m.index.name = 'Datetime'
278
-
279
- df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
280
- df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
281
- df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
282
-
283
- opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
284
- highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
285
- lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
286
- closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
287
- spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
288
- vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
289
- vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
290
-
291
- df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
292
-
293
-
294
- prices_vix = vix.history(start=data_start_date, interval='1d')
295
- prices_vvix = vvix.history(start=data_start_date, interval='1d')
296
- prices_spx = spx.history(start=data_start_date, interval='1d')
297
-
298
- prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
299
- prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
300
- prices_spx.index = prices_spx['index']
301
- prices_spx = prices_spx.drop(columns='index')
302
- prices_spx.index = pd.DatetimeIndex(prices_spx.index)
303
-
304
- prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
305
- prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
306
- prices_vix.index = prices_vix['index']
307
- prices_vix = prices_vix.drop(columns='index')
308
- prices_vix.index = pd.DatetimeIndex(prices_vix.index)
309
-
310
- prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
311
- prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
312
- prices_vvix.index = prices_vvix['index']
313
- prices_vvix = prices_vvix.drop(columns='index')
314
- prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
315
-
316
- data = prices_spx.merge(df_intra, left_index=True, right_index=True)
317
- data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
318
- data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
319
-
320
- # Features
321
- data['PrevClose'] = data['Close'].shift(1)
322
- data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
323
- data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
324
- data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
325
- data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
326
- data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
327
-
328
- data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
329
- data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
330
-
331
- data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
332
- data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)
333
-
334
- data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1) # Current day range in points
335
- data['RangePct'] = data['Range'] / data['Close']
336
- data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
337
- data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
338
- data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
339
- data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
340
- data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
341
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
342
- data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
343
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)
344
- data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
345
- data['RangePct_n1'] = data['RangePct'].shift(1)
346
- data['RangePct_n2'] = data['RangePct'].shift(2)
347
- data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
348
- data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
349
- data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
350
- data['CurrentGapHist'] = data['CurrentGap'].copy()
351
- data['CurrentGap'] = data['CurrentGap'].shift(-1)
352
- data['DayOfWeek'] = pd.to_datetime(data.index)
353
- data['DayOfWeek'] = data['DayOfWeek'].dt.day
354
-
355
- # Intraday features
356
- data['CurrentOpen30'] = data['Open30'].shift(-1)
357
- data['CurrentHigh30'] = data['High30'].shift(-1)
358
- data['CurrentLow30'] = data['Low30'].shift(-1)
359
- data['CurrentClose30'] = data['Close30'].shift(-1)
360
- data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)
361
- data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
362
- data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
363
- data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
364
-
365
- data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
366
- data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
367
-
368
- data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
369
-
370
- # Open to High
371
- data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
372
- data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
373
- data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
374
- data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
375
- data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
376
-
377
- # Target for the regression model -- next day's OHLC4 return relative to today's close
378
- data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
379
- data['Target'] = data['Target'].shift(-1)
380
- # data['Target'] = data['RangePct'].shift(-1)
381
-
382
- # Target for clf -- whether tomorrow will close above or below today's close
383
- data['Target_clf'] = data['Close'] > data['PrevClose']
384
- data['Target_clf'] = data['Target_clf'].shift(-1)
385
- data['DayOfWeek'] = pd.to_datetime(data.index)
386
- data['Quarter'] = data['DayOfWeek'].dt.quarter
387
- data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
388
-
389
- # Calculate up
390
- data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
391
-
392
- # Calculate upSD
393
- data['upSD'] = data['up'].rolling(30).std(ddof=0)
394
-
395
- # Calculate aveUp
396
- data['aveUp'] = data['up'].rolling(30).mean()
397
- data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
398
- data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
399
- data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
400
- data['downSD'] = data['down'].rolling(30).std(ddof=0)
401
- data['aveDown'] = data['down'].rolling(30).mean()
402
- data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
403
- data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
404
-
405
- data = data.assign(
406
- L1Touch = lambda x: x['Low'] < x['L1'],
407
- L2Touch = lambda x: x['Low'] < x['L2'],
408
- H1Touch = lambda x: x['High'] > x['H1'],
409
- H2Touch = lambda x: x['High'] > x['H2'],
410
- L1Break = lambda x: x['Close'] < x['L1'],
411
- L1TouchRed = lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),
412
- L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
413
- L2Break = lambda x: x['Close'] < x['L2'],
414
- H1Break = lambda x: x['Close'] > x['H1'],
415
- H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
416
- H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
417
- H2Break = lambda x: x['Close'] > x['H2'],
418
- OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
419
- OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
420
- OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
421
- OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
422
- CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
423
- CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
424
- CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
425
- CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
426
- )
427
-
428
- data['OpenL1'] = data['OpenL1'].shift(-1)
429
- data['OpenL2'] = data['OpenL2'].shift(-1)
430
- data['OpenH1'] = data['OpenH1'].shift(-1)
431
- data['OpenH2'] = data['OpenH2'].shift(-1)
432
- data['CloseL1'] = data['CloseL1'].shift(-1)
433
- data['CloseL2'] = data['CloseL2'].shift(-1)
434
- data['CloseH1'] = data['CloseH1'].shift(-1)
435
- data['CloseH2'] = data['CloseH2'].shift(-1)
436
-
437
- level_cols = [
438
- 'L1Touch',
439
- 'L2Touch',
440
- 'H1Touch',
441
- 'H2Touch',
442
- 'L1Break',
443
- 'L2Break',
444
- 'H1Break',
445
- 'H2Break'
446
- ]
447
-
448
- for col in level_cols:
449
- data[col+'Pct'] = data[col].rolling(100).mean()
450
- # data[col+'Pct'] = data[col+'Pct'].shift(-1)
451
-
452
- data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
453
- data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
454
- data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
455
- data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
456
- data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
457
- data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
458
-
459
- data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
460
- data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
461
-
462
- def get_quintiles(df, col_name, q):
463
- return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
464
-
465
- probas = []
466
- # Given the current price level
467
- for i, pct in enumerate(data['CurrentClose30toClose']):
468
- try:
469
- # Split
470
- df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
471
- for q in df_q.index:
472
- if q.left <= pct <= q.right:
473
- p = df_q[q]
474
- except:
475
- p = None
476
-
477
- probas.append(p)
478
-
479
- # gapfills = []
480
- # for i, pct in enumerate(data['CurrentGap']):
481
- # try:
482
- # df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
483
- # for q in df_q.index:
484
- # if q.left <= pct <= q.right:
485
- # p = df_q[q]
486
- # except:
487
- # p = None
488
-
489
- # gapfills.append(p)
490
-
491
- data['GreenProbas'] = probas
492
- # data['GapFillGreenProba'] = gapfills
493
-
494
- for rid in tqdm(release_ids, desc='Merging econ data'):
495
- # Get the name of the release
496
- n = releases[rid]['name']
497
- # Merge the corresponding DF of the release
498
- data = data.merge(releases[rid]['df'], how = 'left', left_index=True, right_index=True)
499
- # Create a column that shifts the value in the merged column up by 1
500
- data[f'{n}_shift'] = data[n].shift(-1)
501
- # Fill the rest with zeroes
502
- data[n] = data[n].fillna(0)
503
- data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
504
-
505
- data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
506
-
507
- def cumul_sum(col):
508
- nums = []
509
- s = 0
510
- for x in col:
511
- if x == 1:
512
- s += 1
513
- elif x == 0:
514
- s = 0
515
- nums.append(s)
516
- return nums
517
-
518
- consec_green = cumul_sum(data['GreenDay'].values)
519
- consec_red = cumul_sum(data['RedDay'].values)
520
-
521
- data['DaysGreen'] = consec_green
522
- data['DaysRed'] = consec_red
523
-
524
- final_row = data.index[-2]
525
-
526
- exp_row = data.index[-1]
527
-
528
- df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
529
- df_final = df_final.dropna(subset=['Target','Target_clf'])
530
- # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
531
- return data, df_final, final_row
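
Note (editor): the walk_forward_validation routine deleted here (and retained in model_intra_v2.py below) is an expanding-window backtest: a small LightGBM classifier is refit at every TimeSeriesSplit step and only out-of-sample probabilities are kept. A minimal sketch follows, with an explicit minimum training window added for clarity (the repo's version uses n_splits=len(df)-1); the function and argument names are illustrative, not the repo's exact API.

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit

def walk_forward_proba(df, feature_cols, target_col, min_train_rows=100, test_size=1):
    """Expanding-window walk-forward: each split trains on all earlier rows and
    scores the next `test_size` rows, so every prediction is out-of-sample."""
    X = df[feature_cols]
    y = df[target_col].astype(bool)
    n_splits = max(1, (len(df) - min_train_rows) // test_size)
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
    results = []
    for train_idx, test_idx in tscv.split(df):
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        proba = model.predict_proba(X.iloc[test_idx])[:, -1]
        results.append(pd.DataFrame({'True': y.iloc[test_idx].values, 'Predicted': proba},
                                    index=X.iloc[test_idx].index))
    # Return the stitched out-of-sample results and the most recently fitted model
    return pd.concat(results), model
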
model_intra_v2.py CHANGED
@@ -15,58 +15,70 @@ import lightgbm as lgb
15
  from sklearn.model_selection import TimeSeriesSplit
16
  from intraCols import model_cols
17
 
18
- # If the dataset is gated/private, make sure you have run huggingface-cli login
19
- def walk_forward_validation(df, target_column, num_periods):
20
 
21
  df = df[model_cols + [target_column]]
22
  df[target_column] = df[target_column].astype(bool)
23
 
24
- # Model
25
- # model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
26
-
27
  tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
28
 
29
- overall_results = []
30
- # Iterate over the rows in the DataFrame, one step at a time
31
- # Split the time series data using TimeSeriesSplit
32
- for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
33
- # Extract the training and testing data for the current split
34
- X_train = df.drop(target_column, axis=1).iloc[train_index]
35
- y_train = df[target_column].iloc[train_index]
36
- X_test = df.drop(target_column, axis=1).iloc[test_index]
37
- y_test = df[target_column].iloc[test_index]
38
-
39
- y_train = y_train.astype(bool)
40
- model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
41
- model.fit(X_train, y_train)
42
- # Make a prediction on the test data
43
- predictions = model.predict_proba(X_test)[:,-1]
44
-
45
- # Create a DataFrame to store the true and predicted values
46
- result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
47
- overall_results.append(result_df)
 
 
48
 
49
- df_results = pd.concat(overall_results)
 
 
50
 
51
- # Calibrate Probabilities
52
- def get_quantiles(df, col_name, q):
53
- return df.groupby(pd.cut(df[col_name], q))['True'].mean()
 
 
 
 
 
 
54
 
55
- greenprobas = []
56
- for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
57
- try:
58
- df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
59
- for q in df_q.index:
60
- if q.left <= pct <= q.right:
61
- p = df_q[q]
62
- except:
63
- p = None
64
 
65
- greenprobas.append(p)
66
 
67
- df_results['CalibPredicted'] = greenprobas
68
 
69
- return df_results, model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def seq_predict_proba(df, trained_clf_model):
72
  clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
 
15
  from sklearn.model_selection import TimeSeriesSplit
16
  from intraCols import model_cols
17
 
18
+ def walk_forward_validation(df, target_column, num_periods, mode='full'):
 
19
 
20
  df = df[model_cols + [target_column]]
21
  df[target_column] = df[target_column].astype(bool)
22
 
 
 
 
23
  tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
24
 
25
+ if mode == 'full':
26
+ overall_results = []
27
+ # Iterate over the rows in the DataFrame, one step at a time
28
+ # Split the time series data using TimeSeriesSplit
29
+ for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
30
+ # Extract the training and testing data for the current split
31
+ X_train = df.drop(target_column, axis=1).iloc[train_index]
32
+ y_train = df[target_column].iloc[train_index]
33
+ X_test = df.drop(target_column, axis=1).iloc[test_index]
34
+ y_test = df[target_column].iloc[test_index]
35
+
36
+ y_train = y_train.astype(bool)
37
+ model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
38
+ model.fit(X_train, y_train)
39
+ # Make a prediction on the test data
40
+ predictions = model.predict_proba(X_test)[:,-1]
41
+
42
+ # Create a DataFrame to store the true and predicted values
43
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
44
+ overall_results.append(result_df)
45
+ df_results = pd.concat(overall_results)
46
 
47
+ # Calibrate Probabilities
48
+ def get_quantiles(df, col_name, q):
49
+ return df.groupby(pd.cut(df[col_name], q))['True'].mean()
50
 
51
+ greenprobas = []
52
+ for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
53
+ try:
54
+ df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
55
+ for q in df_q.index:
56
+ if q.left <= pct <= q.right:
57
+ p = df_q[q]
58
+ except:
59
+ p = None
60
 
61
+ greenprobas.append(p)
 
 
 
 
 
 
 
 
62
 
63
+ df_results['CalibPredicted'] = greenprobas
64
 
65
+ return df_results, model
66
 
67
+ elif mode == 'single':
68
+ X_train = df.drop(target_column, axis=1).iloc[:-1]
69
+ y_train = df[target_column].iloc[:-1]
70
+ X_test = df.drop(target_column, axis=1).iloc[-1]
71
+ y_test = df[target_column].iloc[-1]
72
+ y_train = y_train.astype(bool)
73
+ model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
74
+ model.fit(X_train, y_train)
75
+ predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
76
+ result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
77
+
78
+ return result_df, model
79
+
80
+
81
+
82
 
83
  def seq_predict_proba(df, trained_clf_model):
84
  clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
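
Note (editor): the calibration step kept in model_intra_v2.py maps each raw LightGBM probability to the empirical green rate of its quantile bucket over the prior backtest history, while the new mode='single' branch refits once on everything except the latest row and scores only that row. A compact sketch of the calibration idea is below; the helper name is illustrative, and the usage lines assume the df_final / 'Target_clf' naming used elsewhere in the repo.

import numpy as np
import pandas as pd

def calibrate_probabilities(results: pd.DataFrame, bins: int = 7) -> pd.Series:
    """`results` holds boolean 'True' outcomes and raw 'Predicted' probabilities.
    For each row, bin the prediction against the *prior* rows (pd.cut into `bins`
    equal-width bins) and return that bin's historical hit rate."""
    calibrated = []
    for i, pct in enumerate(results['Predicted']):
        p = np.nan
        history = results.iloc[:i]
        if len(history) > bins:
            rates = history.groupby(pd.cut(history['Predicted'], bins))['True'].mean()
            for interval, rate in rates.items():
                if interval.left <= pct <= interval.right:
                    p = rate
                    break
        calibrated.append(p)
    return pd.Series(calibrated, index=results.index)

# e.g. res, model = walk_forward_validation(df_final, 'Target_clf', num_periods=1, mode='full')
#      res['CalibPredicted'] = calibrate_probabilities(res)
#      res_last, model_last = walk_forward_validation(df_final, 'Target_clf', num_periods=1, mode='single')
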
requirements.txt CHANGED
@@ -19,4 +19,5 @@ huggingface_hub
19
  holidays
20
  pytz
21
  sqlalchemy<2.0
22
- mysqlclient
 
 
19
  holidays
20
  pytz
21
  sqlalchemy<2.0
22
+ mysqlclient
23
+ mplfinance
troubleshoot_day_model.ipynb CHANGED
The diff for this file is too large to render. See raw diff