boomsss committed
Commit 86fa8c7 • 1 Parent(s): ef54379

making things faster

Files changed (9)
  1. app.py +6 -5
  2. ca-certificates.crt +0 -0
  3. dailyCols.py +33 -0
  4. getDailyData.py +278 -0
  5. getIntraData.py +140 -0
  6. intraCols.py +48 -0
  7. model_day_v2.py +118 -0
  8. model_intra_v2.py +73 -0
  9. requirements.txt +2 -1
app.py CHANGED
@@ -4,9 +4,10 @@ import numpy as np
 from sklearn.metrics import roc_auc_score, precision_score, recall_score
 from pandas.tseries.offsets import BDay
 import streamlit as st
-from datetime import datetime, time, timedelta
+from datetime import datetime
 import pytz
 import holidays
+from getDailyData import get_daily
 
 st.set_page_config(
     page_title="Gameday $SPX",
@@ -121,12 +122,12 @@ with st.form("choose_model"):
 
     if option == '06:30':
 
-        from model_day import *
+        from model_day_v2 import *
 
         fname='performance_for_open_model.csv'
 
         my_bar.progress(0.33, 'Loading data...')
-        data, df_final, final_row = get_data()
+        data, df_final, final_row = get_daily()
         # st.success("✅ Historical data")
 
         my_bar.progress(0.66, "Training models...")
@@ -187,11 +188,11 @@ with st.form("choose_model"):
 
     else:
 
-        from model_intra import *
+        from model_intra_v2 import *
         idx = times_list.index(option)
 
         my_bar.progress(0.33, 'Loading data...')
-        data, df_final, final_row = get_data(idx)
+        data, df_final, final_row = get_daily(mode='intra', periods_30m=idx)
         # st.success("✅ Historical data")
 
         my_bar.progress(0.66, "Training models...")
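
For quick reference, the new loading contract app.py relies on, as a minimal sketch; the times_list values here are illustrative stand-ins for the ones defined elsewhere in app.py:

from getDailyData import get_daily

# Open (06:30) model: daily features only.
data, df_final, final_row = get_daily()

# Intraday models: include the first `idx` completed 30-minute bars of the session.
times_list = ['06:30', '07:00', '07:30', '08:00']  # hypothetical subset
idx = times_list.index('07:30')
data, df_final, final_row = get_daily(mode='intra', periods_30m=idx)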
ca-certificates.crt ADDED
The diff for this file is too large to render.
 
dailyCols.py ADDED
@@ -0,0 +1,33 @@
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'VIXOpen',
+    'VVIXOpen',
+    'OpenL1',
+    'OpenL2',
+    'OpenH1',
+    'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'H1BreakTouchPct',
+    'H2BreakTouchPct',
+    'L1BreakTouchPct',
+    'L2BreakTouchPct'
+]
getDailyData.py ADDED
@@ -0,0 +1,278 @@
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+from tqdm import tqdm
+import os
+import datetime
+import json
+from sqlalchemy import create_engine
+
+data_start_date = '2018-07-01'
+
+def get_daily(mode='daily', periods_30m=None):
+    '''
+    Get daily data and create daily features. Optionally append intraday data.
+    `mode`: 'daily' or 'intra'.
+    `periods_30m`: how many 30-minute periods to bring in. Only specify if mode == 'intra'.
+    '''
+    vix = yf.Ticker('^VIX')
+    vvix = yf.Ticker('^VVIX')
+    spx = yf.Ticker('^GSPC')
+
+    prices_vix = vix.history(start=data_start_date, interval='1d')
+    prices_vvix = vvix.history(start=data_start_date, interval='1d')
+    prices_spx = spx.history(start=data_start_date, interval='1d')
+
+    # Normalize each frame to a tz-naive daily DatetimeIndex
+    for prices in (prices_spx, prices_vix, prices_vvix):
+        prices.index = pd.DatetimeIndex(prices.index.date)
+
+    if mode == 'intra':
+        from getIntraData import get_intra
+        df_intra = get_intra(periods_30m)
+        data = prices_spx.merge(df_intra, left_index=True, right_index=True)
+    else:
+        data = prices_spx.copy()
+
+    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+    data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+
+    # Daily features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1).astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)
+    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
+    data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)
+
+    # Did VIX/VVIX open above the prior close?
+    data['VIXOpen'] = (data['Open_VIX'] > data['Close_VIX'].shift(1)).astype(bool)
+    data['VVIXOpen'] = (data['Open_VVIX'] > data['Close_VVIX'].shift(1)).astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # current day's range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = (data['OHLC4'] > data['OHLC4'].shift(1)).astype(bool)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1).astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2).astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGapHist'] = data['CurrentGap'].copy()
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)  # align today's row with tomorrow's opening gap
+
+    # Regression target -- the next day's OHLC4 return vs. today's close
+    data['Target'] = ((data['OHLC4'] / data['PrevClose']) - 1).shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Classification target -- whether tomorrow closes above today's close
+    data['Target_clf'] = (data['Close'] > data['PrevClose']).shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+    # Rolling stats of prior-day up/down extensions, used to set the H1/H2 and L1/L2 levels
+    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+    data['upSD'] = data['up'].rolling(30).std(ddof=0)
+    data['aveUp'] = data['up'].rolling(30).mean()
+    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+    data['downSD'] = data['down'].rolling(30).std(ddof=0)
+    data['aveDown'] = data['down'].rolling(30).mean()
+    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
+
+    data = data.assign(
+        L1Touch = lambda x: x['Low'] < x['L1'],
+        L2Touch = lambda x: x['Low'] < x['L2'],
+        H1Touch = lambda x: x['High'] > x['H1'],
+        H2Touch = lambda x: x['High'] > x['H2'],
+        L1Break = lambda x: x['Close'] < x['L1'],
+        L1TouchRed = lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),
+        L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
+        L2Break = lambda x: x['Close'] < x['L2'],
+        H1Break = lambda x: x['Close'] > x['H1'],
+        H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
+        H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
+        H2Break = lambda x: x['Close'] > x['H2'],
+        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0)
+    )
+
+    # Shift the open-vs-level flags so today's row describes tomorrow's open
+    data['OpenL1'] = data['OpenL1'].shift(-1)
+    data['OpenL2'] = data['OpenL2'].shift(-1)
+    data['OpenH1'] = data['OpenH1'].shift(-1)
+    data['OpenH2'] = data['OpenH2'].shift(-1)
+
+    level_cols = [
+        'L1Touch', 'L2Touch', 'H1Touch', 'H2Touch',
+        'L1Break', 'L2Break', 'H1Break', 'H2Break'
+    ]
+    for col in level_cols:
+        data[col + 'Pct'] = data[col].rolling(100).mean()
+        # data[col + 'Pct'] = data[col + 'Pct'].shift(-1)
+
+    # Conditional break rates: share of touches that became breaks, over a 100-day window
+    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+    data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+    data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+
+    if mode == 'intra':
+        # Intraday features from the first `periods_30m` bars of the next session
+        data['CurrentOpen30'] = data['Open30'].shift(-1)
+        data['CurrentHigh30'] = data['High30'].shift(-1)
+        data['CurrentLow30'] = data['Low30'].shift(-1)
+        data['CurrentClose30'] = data['Close30'].shift(-1)
+        data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].mean(axis=1)
+        data['OHLC4_Current_Trend'] = (data['CurrentOHLC430'] > data['OHLC4']).astype(bool)
+        data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+        data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
+        data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
+        data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
+
+        # First-30m move relative to today's close
+        data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+        data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+        data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+        data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+        # Did the first 30 minutes trade back to the prior close (gap fill)?
+        data['GapFill30'] = [
+            low <= prev_close if gap > 0 else high >= prev_close
+            for high, low, prev_close, gap
+            in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])
+        ]
+        data['CloseL1'] = np.where(data['Close30'] < data['L1'], 1, 0)
+        data['CloseL2'] = np.where(data['Close30'] < data['L2'], 1, 0)
+        data['CloseH1'] = np.where(data['Close30'] > data['H1'], 1, 0)
+        data['CloseH2'] = np.where(data['Close30'] > data['H2'], 1, 0)
+        data['CloseL1'] = data['CloseL1'].shift(-1)
+        data['CloseL2'] = data['CloseL2'].shift(-1)
+        data['CloseH1'] = data['CloseH1'].shift(-1)
+        data['CloseH2'] = data['CloseH2'].shift(-1)
+
+        def get_quintiles(df, col_name, q):
+            return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
+        # Empirical green-day probability, given which historical decile of
+        # first-30m moves the current move falls into
+        probas = []
+        for i, pct in enumerate(data['CurrentClose30toClose']):
+            p = None
+            try:
+                df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+                for q in df_q.index:
+                    if q.left <= pct <= q.right:
+                        p = df_q[q]
+            except Exception:
+                p = None
+            probas.append(p)
+
+        data['GreenProbas'] = probas
+
+    # Economic-release calendar from MySQL (credentials via environment variables)
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:"
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/"
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    df_releases = pd.read_sql_query('select * from releases', con=engine)
+    df_releases = df_releases.set_index('Datetime')
+    data = data.merge(df_releases, how='left', left_index=True, right_index=True)
+
+    for n in tqdm(df_releases.columns, desc='Merging econ data'):
+        # Shift each release flag up one row so today's row sees tomorrow's release
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        # Running count of consecutive 1s, reset to zero on each 0
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    data['DaysGreen'] = cumul_sum(data['GreenDay'].values)
+    data['DaysRed'] = cumul_sum(data['RedDay'].values)
+
+    final_row = data.index[-2]
+
+    if mode == 'daily':
+        from dailyCols import model_cols
+    elif mode == 'intra':
+        from intraCols import model_cols
+
+    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+    df_final = df_final.dropna(subset=['Target','Target_clf'])
+    # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
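
The H1/H2 (and symmetric L1/L2) levels above come from the 30-day rolling mean and standard deviation of prior-day up-moves, expressed as a percent of the close. A toy check with made-up numbers:

# Made-up inputs: rolling stats of 'up' (%), and today's open.
aveUp, upSD, open_px = 0.6, 0.4, 4000.0
H1 = open_px + (aveUp / 100) * open_px            # 4000 * 1.006 = 4024.0
H2 = open_px + ((aveUp + upSD) / 100) * open_px   # 4000 * 1.010 = 4040.0
print(H1, H2)  # 4024.0 4040.0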
getIntraData.py ADDED
@@ -0,0 +1,140 @@
+import pandas as pd
+import pandas_datareader as pdr
+import yfinance as yf
+import datetime
+# from datasets import load_dataset
+from sqlalchemy import create_engine
+import os
+from getDailyData import data_start_date
+from dotenv import load_dotenv
+
+# Load environment variables from the .env file
+load_dotenv()
+
+def get_intra(periods_30m=1):
+    '''
+    Get historical 30-minute data and append live data to it, if any exists.
+    '''
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:"
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/"
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    # Date literals are quoted so MySQL compares them as dates
+    query = f'''SELECT
+        spx30.Datetime AS Datetime,
+        spx30.Open AS Open30,
+        spx30.High AS High30,
+        spx30.Low AS Low30,
+        spx30.Close AS Close30,
+        vix30.Open AS Open_VIX30,
+        vix30.High AS High_VIX30,
+        vix30.Low AS Low_VIX30,
+        vix30.Close AS Close_VIX30,
+        vvix30.Open AS Open_VVIX30,
+        vvix30.High AS High_VVIX30,
+        vvix30.Low AS Low_VVIX30,
+        vvix30.Close AS Close_VVIX30
+    FROM
+        SPX_full_30min AS spx30
+    LEFT JOIN
+        VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > '{data_start_date}'
+    LEFT JOIN
+        VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > '{data_start_date}'
+    WHERE
+        spx30.Datetime > '{data_start_date}'
+    '''
+
+    df_30m = pd.read_sql_query(sql=query, con=engine)
+    df_30m['Datetime'] = df_30m['Datetime'].dt.tz_localize('America/New_York')
+    df_30m = df_30m.set_index('Datetime', drop=True)
+
+    # First date not yet covered by the stored data
+    last_date = df_30m.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+
+    # Get incremental data for each index
+    spx1 = yf.Ticker('^GSPC')
+    vix1 = yf.Ticker('^VIX')
+    vvix1 = yf.Ticker('^VVIX')
+    yfp = spx1.history(start=last_date, interval='30m')
+    yf_vix = vix1.history(start=last_date, interval='30m')
+    yf_vvix = vvix1.history(start=last_date, interval='30m')
+
+    if len(yfp) > 0:
+        # Convert indexes to America/New_York if not already
+        for _df in [yfp, yf_vix, yf_vvix]:
+            if (not isinstance(_df.index, pd.DatetimeIndex)) or (str(_df.index.tz) != 'America/New_York'):
+                _df['Datetime'] = pd.to_datetime(_df.index)
+                _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
+                _df.set_index('Datetime', inplace=True)
+        # Concat them
+        df_inc = pd.concat([
+            yfp[['Open','High','Low','Close']],
+            yf_vix[['Open','High','Low','Close']],
+            yf_vvix[['Open','High','Low','Close']]
+        ], axis=1)
+        df_inc.columns = df_30m.columns
+        df_inc = df_inc.loc[
+            (df_inc.index.time >= datetime.time(9, 30)) & (df_inc.index.time < datetime.time(16, 0))
+        ]
+        df_30m = pd.concat([df_30m, df_inc])
+
+    # Keep regular-session bars only, then the first `periods_30m` bars of each day
+    df_30m = df_30m.loc[
+        (df_30m.index.time >= datetime.time(9, 30)) & (df_30m.index.time < datetime.time(16, 0))
+    ]
+    df_30m['dt'] = df_30m.index.date
+    df_30m = df_30m.groupby('dt').head(periods_30m)
+    df_30m = df_30m.set_index('dt', drop=True)
+    df_30m.index.name = 'Datetime'
+
+    # Bar-over-bar performance within the selected window
+    df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
+    df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
+    df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
+
+    # Aggregate the selected bars into one OHLC row per day
+    opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
+    highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
+    lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
+    closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
+    spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
+    vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
+    vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
+
+    df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
+    return df_intra
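A minimal way to exercise get_intra on its own (assumes the DATABASE_* environment variables and the ca-certificates.crt file that the engine needs are in place):

from getIntraData import get_intra

# First two 30-minute bars of each session, aggregated to one row per day.
df_intra = get_intra(periods_30m=2)
print(df_intra[['Open30', 'High30', 'Low30', 'Close30']].tail())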
intraCols.py ADDED
@@ -0,0 +1,48 @@
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentHigh30toClose',
+    'CurrentLow30toClose',
+    'CurrentClose30toClose',
+    'CurrentRange30',
+    'GapFill30',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'OHLC4_Current_Trend',
+    'OHLC4_Trend',
+    'CurrentVIXTrend',
+    'SPX30IntraPerf',
+    'VIX30IntraPerf',
+    'VVIX30IntraPerf',
+    # 'OpenL1',
+    # 'OpenL2',
+    # 'OpenH1',
+    # 'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'GreenProbas',
+    'H1BreakTouchPct',
+    'H2BreakTouchPct',
+    'L1BreakTouchPct',
+    'L2BreakTouchPct',
+    'H1BreakH2TouchPct',
+    'L1BreakL2TouchPct',
+    'H1TouchGreenPct',
+    'L1TouchRedPct'
+    # 'GapFillGreenProba'
+]
model_day_v2.py ADDED
@@ -0,0 +1,118 @@
+import pandas as pd
+import pandas_datareader as pdr
+import yfinance as yf
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+from tqdm import tqdm
+from sklearn import linear_model
+import os
+import lightgbm as lgb
+from dailyCols import model_cols
+
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+    # Expanding-window walk-forward regression
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state=42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i + num_periods]
+        y_test = df[target_column].iloc[i:i + num_periods]
+
+        # Fit the model and predict the held-out window
+        model.fit(X_train, y_train)
+        predictions = model.predict(X_test)
+
+        # Store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and the fitted model
+    return df_results, model
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+    # Run the regression model first; its sign becomes a feature for the classifier
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the regression output back onto df for the classifier
+    for_merge = res[['Predicted']].copy()
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[model_cols + ['RegrModelOut', target_column_clf]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # model2 = xgb.XGBClassifier(n_estimators=10, random_state=42)
+    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+    # model2 = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i + num_periods]
+        y_test = df[target_column_clf].iloc[i:i + num_periods]
+
+        model2.fit(X_train, y_train)
+        # Predicted probability of the positive (green) class
+        predictions = model2.predict_proba(X_test)[:, -1]
+
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+
+    # Calibrate probabilities: empirical hit rate per predicted-probability bin
+    def get_quantiles(df, col_name, q):
+        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+    greenprobas = []
+    meanprobas = []
+    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
+        p, c = None, None
+        try:
+            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+                    c = (q.left + q.right) / 2
+        except Exception:
+            pass
+
+        greenprobas.append(p)
+        meanprobas.append(c)
+
+    df_results['CalibPredicted'] = greenprobas
+
+    return df_results, model1, model2
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:, -1]
+    return clf_pred_proba
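
A sketch of how the sequential (regression, then classifier) open model is meant to be driven; the window sizes here are illustrative, app.py supplies the real ones:

from getDailyData import get_daily
from dailyCols import model_cols
from model_day_v2 import walk_forward_validation_seq, seq_predict_proba

data, df_final, final_row = get_daily()
res, model1, model2 = walk_forward_validation_seq(
    df_final, 'Target_clf', 'Target', num_training_rows=100, num_periods=1
)
# Probability that the next close is green, for the most recent complete row:
proba = seq_predict_proba(data.loc[[final_row], model_cols], model1, model2)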
model_intra_v2.py ADDED
@@ -0,0 +1,73 @@
+import streamlit as st
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+from tqdm import tqdm
+import os
+import datetime
+from pandas.tseries.offsets import BDay
+from datasets import load_dataset
+import lightgbm as lgb
+from sklearn.model_selection import TimeSeriesSplit
+from intraCols import model_cols
+
+# If the dataset is gated/private, make sure you have run huggingface-cli login
+def walk_forward_validation(df, target_column, num_periods):
+    df = df[model_cols + [target_column]].copy()
+    df[target_column] = df[target_column].astype(bool)
+
+    # Expanding-window splits; each test window holds the next `num_periods` rows
+    tscv = TimeSeriesSplit(n_splits=len(df) - 1, max_train_size=None, test_size=num_periods)
+
+    overall_results = []
+    # Iterate over the splits, one step at a time
+    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
+        # Extract the training and testing data for the current split
+        X_train = df.drop(target_column, axis=1).iloc[train_index]
+        y_train = df[target_column].iloc[train_index].astype(bool)
+        X_test = df.drop(target_column, axis=1).iloc[test_index]
+        y_test = df[target_column].iloc[test_index]
+
+        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+        model.fit(X_train, y_train)
+        # Predicted probability of the positive (green) class
+        predictions = model.predict_proba(X_test)[:, -1]
+
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+
+    # Calibrate probabilities: empirical hit rate per predicted-probability bin
+    def get_quantiles(df, col_name, q):
+        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+    greenprobas = []
+    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
+        p = None
+        try:
+            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+        except Exception:
+            pass
+
+        greenprobas.append(p)
+
+    df_results['CalibPredicted'] = greenprobas
+
+    return df_results, model
+
+def seq_predict_proba(df, trained_clf_model):
+    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:, -1]
+    return clf_pred_proba
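
And the intraday counterpart, as a minimal sketch (periods_30m and num_periods values are illustrative):

from getDailyData import get_daily
from model_intra_v2 import walk_forward_validation, seq_predict_proba

# Features through the first 30-minute bar of each session.
data, df_final, final_row = get_daily(mode='intra', periods_30m=1)
res, model = walk_forward_validation(df_final, 'Target_clf', num_periods=1)
# Probability of a green close for the most recent complete row:
proba = seq_predict_proba(data.loc[[final_row]], model)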
requirements.txt CHANGED
@@ -17,4 +17,5 @@ scipy
 datasets
 huggingface_hub
 holidays
-pytz
+pytz
+sqlalchemy