hellno-o committed
Commit 4dd5bdd
Parent: 1e601a1

Convert python script into streamlit app - first steps

Files changed (1)
  1. app.py +190 -124
app.py CHANGED
@@ -3,9 +3,11 @@ import streamlit as st
 st.title('Numerai Example Script')
 
 
-# content below from
+# content below adapted from
 # https://github.com/numerai/example-scripts/blob/master/example_model.py
-#
+#
+
+IS_RUNNING_IN_HUGGING_FACE = False
 
 import pandas as pd
 from lightgbm import LGBMRegressor
@@ -27,144 +29,208 @@ from utils import (
 )
 
 
-# download all the things
-
 napi = NumerAPI()
-
 current_round = napi.get_current_round()
 
 # Tournament data changes every week so we specify the round in their name. Training
-# and validation data only change periodically, so no need to download them every time.
-print('Downloading dataset files...')
+# and validation data only change periodically, so no need to download
+# them every time.
 
 Path("./v4").mkdir(parents=False, exist_ok=True)
-napi.download_dataset("v4/train.parquet")
-napi.download_dataset("v4/validation.parquet")
-napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
-napi.download_dataset("v4/validation_example_preds.parquet")
-napi.download_dataset("v4/features.json")
-
-print('Reading minimal training data')
-# read the feature metadata and get a feature set (or all the features)
-with open("v4/features.json", "r") as f:
-    feature_metadata = json.load(f)
-# features = list(feature_metadata["feature_stats"].keys()) # get all the features
-# features = feature_metadata["feature_sets"]["small"] # get the small feature set
-features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
-# read in just those features along with era and target columns
-read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
-
-# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
-# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
-training_data = pd.read_parquet('v4/train.parquet',
-                                columns=read_columns)
-validation_data = pd.read_parquet('v4/validation.parquet',
-                                  columns=read_columns)
-live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
-                            columns=read_columns)
-
-
-# pare down the number of eras to every 4th era
-# every_4th_era = training_data[ERA_COL].unique()[::4]
-# training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
-
-# getting the per era correlation of each feature vs the target
-all_feature_corrs = training_data.groupby(ERA_COL).apply(
-    lambda era: era[features].corrwith(era[TARGET_COL])
-)
-
-# find the riskiest features by comparing their correlation vs
-# the target in each half of training data; we'll use these later
-riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
-
-# "garbage collection" (gc) gets rid of unused data and frees up memory
-gc.collect()
-
-model_name = f"model_target"
-print(f"Checking for existing model '{model_name}'")
-model = load_model(model_name)
-if not model:
-    print(f"model not found, creating new one")
-    params = {"n_estimators": 2000,
-              "learning_rate": 0.01,
-              "max_depth": 5,
-              "num_leaves": 2 ** 5,
-              "colsample_bytree": 0.1}
-
-    model = LGBMRegressor(**params)
-
-    # train on all of train and save the model so we don't have to train next time
-    model.fit(training_data.filter(like='feature_', axis='columns'),
-              training_data[TARGET_COL])
-    print(f"saving new model: {model_name}")
-    save_model(model, model_name)
-
-gc.collect()
-
-nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()
-
-# check for nans and fill nans
-if nans_per_col.any():
-    total_rows = len(live_data[live_data["data_type"] == "live"])
-    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
-    print(f"out of {total_rows} total rows")
-    print(f"filling nans with 0.5")
-    live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
-
-else:
-    print("No nans in the features this week!")
-
-
-# double check the feature that the model expects vs what is available to prevent our
-# pipeline from failing if Numerai adds more data and we don't have time to retrain!
-model_expected_features = model.booster_.feature_name()
-if set(model_expected_features) != set(features):
-    print(f"New features are available! Might want to retrain model {model_name}.")
-validation_data.loc[:, f"preds_{model_name}"] = model.predict(
-    validation_data.loc[:, model_expected_features])
-live_data.loc[:, f"preds_{model_name}"] = model.predict(
-    live_data.loc[:, model_expected_features])
-
-gc.collect()
-
-# neutralize our predictions to the riskiest features
-validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
-    df=validation_data,
-    columns=[f"preds_{model_name}"],
-    neutralizers=riskiest_features,
-    proportion=1.0,
-    normalize=True,
-    era_col=ERA_COL
-)
-
-live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
-    df=live_data,
-    columns=[f"preds_{model_name}"],
-    neutralizers=riskiest_features,
-    proportion=1.0,
-    normalize=True,
-    era_col=ERA_COL
-)
-
-model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
-
-# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
-validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
-live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
-validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
-live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")
-
-validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
-validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
-
-# get some stats about each of our models to compare...
-# fast_mode=True so that we skip some of the stats that are slower to calculate
-validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
-print(validation_stats[["mean", "sharpe"]].to_markdown())
-
-print(f'''
-Done! Next steps:
-1. Go to numer.ai/tournament (make sure you have an account)
-2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
-3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
-''')
+
+
+@st.cache
+def download_dataset():
+    print('download_dataset')
+
+    if IS_RUNNING_IN_HUGGING_FACE:
+        from datasets import load_dataset_builder
+        ds_builder = load_dataset_builder("Numerati/numerai-datasets")
+    else:
+        napi.download_dataset("v4/train.parquet")
+        napi.download_dataset("v4/validation.parquet")
+        napi.download_dataset("v4/validation_example_preds.parquet")
+        napi.download_dataset("v4/features.json")
+
+    napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
+    print('done download_dataset')
+
+@st.cache
+def load_dataset(feature_set: str):
+    print('load_dataset with feature_set', feature_set)
+    # read the feature metadata and get a feature set (or all the features)
+    with open("v4/features.json", "r") as f:
+        feature_metadata = json.load(f)
+    # features = list(feature_metadata["feature_stats"].keys()) # get all the features
+    # features = feature_metadata["feature_sets"]["small"] # get the small
+    # feature set
+    features = feature_metadata["feature_sets"][feature_set] # get the medium feature set
+    # read in just those features along with era and target columns
+    read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
+
+    # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
+    # if so, delete the file and rerun the napi.download_dataset to fix the
+    # corrupted file
+    training_data = pd.read_parquet('v4/train.parquet',
+                                    columns=read_columns)
+    validation_data = pd.read_parquet('v4/validation.parquet',
+                                      columns=read_columns)
+    live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
+                                columns=read_columns)
+
+    # pare down the number of eras to every 4th era
+    # every_4th_era = training_data[ERA_COL].unique()[::4]
+    # training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
+
+    # getting the per era correlation of each feature vs the target
+    all_feature_corrs = training_data.groupby(ERA_COL).apply(
+        lambda era: era[features].corrwith(era[TARGET_COL])
+    )
+
+    # find the riskiest features by comparing their correlation vs
+    # the target in each half of training data; we'll use these later
+    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
+
+    # "garbage collection" (gc) gets rid of unused data and frees up memory
+    gc.collect()
+    print('done with feature_set', feature_set)
+    return training_data, validation_data, live_data, features, riskiest_features
+
+feature_set = st.selectbox(
+    'Which feature set should be used?',
+    ('small', 'medium', 'fncv3_features', 'v2_equivalent_features', 'v3_equivalent_features'))
+
+data_load_state = st.text('Loading data...')
+download_dataset()
+training_data, validation_data, live_data, features, riskiest_features = load_dataset(feature_set)
+data_load_state.text('Loading data...done!')
+
+st.subheader('Raw data')
+st.write(training_data.head())
+
+st.subheader('Model Configuration')
+
+n_estimators = st.slider('n_estimators', 100, 10000, 2000)
+learning_rate = st.slider('learning_rate', 0.0001, 0.1, 0.01)
+max_depth = st.slider('max_depth', 2, 20, 5)
+
+params = {"n_estimators": n_estimators,
+          "learning_rate": learning_rate,
+          "max_depth": max_depth,
+          "num_leaves": 2 ** 5,
+          "colsample_bytree": 0.1
+          }
+
+model_name = f"model_target"
+
+@st.cache
+def get_model_and_fit(model_name, *params):
+    print('get_model_and_fit')
+    model = load_model(model_name)
+    if not model:
+        with st.spinner('Wait model training...'):
+            print(f"model not found, creating new one")
+
+            model = LGBMRegressor(**params)
+
+            # train on all of train and save the model so we don't have to
+            # train next time
+            model.fit(training_data.filter(like='feature_', axis='columns'),
+                      training_data[TARGET_COL])
+            print(f"saving new model: {model_name}")
+            save_model(model, model_name)
+        st.success('Done model training!')
+
+    gc.collect()
+    print('done get_model_and_fit')
+
+
+
+has_model_preds = False
+
+@st.cache
+def get_model_preds(model_name, *params):
+    print('get_model_preds')
+    model = load_model(model_name)
+
+    has_model_preds = False
+    nans_per_col = live_data[live_data["data_type"]
+                             == "live"][features].isna().sum()
+
+    # check for nans and fill nans
+    if nans_per_col.any():
+        total_rows = len(live_data[live_data["data_type"] == "live"])
+        print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
+        print(f"out of {total_rows} total rows")
+        print(f"filling nans with 0.5")
+        live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
+
+    else:
+        print("No nans in the features this week!")
+
+
+    # double check the feature that the model expects vs what is available to prevent our
+    # pipeline from failing if Numerai adds more data and we don't have time
+    # to retrain!
+    model_expected_features = model.booster_.feature_name()
+    if set(model_expected_features) != set(features):
+        print(f"New features are available! Might want to retrain model {model_name}.")
+    validation_data.loc[:, f"preds_{model_name}"] = model.predict(
+        validation_data.loc[:, model_expected_features])
+    live_data.loc[:, f"preds_{model_name}"] = model.predict(
+        live_data.loc[:, model_expected_features])
+
+    gc.collect()
+
+    # neutralize our predictions to the riskiest features
+    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+        df=validation_data,
+        columns=[f"preds_{model_name}"],
+        neutralizers=riskiest_features,
+        proportion=1.0,
+        normalize=True,
+        era_col=ERA_COL
+    )
+
+    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+        df=live_data,
+        columns=[f"preds_{model_name}"],
+        neutralizers=riskiest_features,
+        proportion=1.0,
+        normalize=True,
+        era_col=ERA_COL
+    )
+
+    model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
+
+    # rename best model to "prediction" and rank from 0 to 1 to meet upload
+    # requirements
+    validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
+    live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
+    validation_prediction_fname = f"validation_predictions_{current_round}.csv"
+    validation_data["prediction"].to_csv(validation_prediction_fname)
+    live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")
+
+    validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
+    validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
+
+    # get some stats about each of our models to compare...
+    # fast_mode=True so that we skip some of the stats that are slower to calculate
+    print('start validation_metrics')
+    validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
+    st.markdown(validation_stats[["mean", "sharpe"]].to_markdown())
+
+    # st.write(f'''
+    # Done! Next steps:
+    # 1. Go to numer.ai/tournament (make sure you have an account)
+    # 2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
+    # 3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
+    # ''')
+    has_model_preds = True
+
+
+st.button('Start model training', on_click=get_model_and_fit, args=[model_name, params])
+st.button('Start model evaluation', on_click=get_model_preds, args=[model_name, params])
+
+if has_model_preds:
+    st.download_button('Validation data for diagnostics tool', validation_data["prediction"], validation_prediction_fname)
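
For context, a minimal, self-contained sketch of the cache-plus-callback pattern the new app.py leans on (@st.cache together with st.button(on_click=...)). The toy data frame and the train_model / on_train_clicked names below are illustrative, not from the repo; note that args=[model_name, params] hands the callback the params dict as a single positional argument, so the callback should accept it as one parameter rather than unpack it with *params.

import numpy as np
import pandas as pd
import streamlit as st
from lightgbm import LGBMRegressor

# Toy stand-in for the Numerai training frame read from v4/train.parquet.
rng = np.random.default_rng(0)
train = pd.DataFrame(rng.random((200, 5)),
                     columns=[f"feature_{i}" for i in range(5)])
train["target"] = rng.random(200)


@st.cache(allow_output_mutation=True)  # pre-1.18 caching API, as in the diff; newer Streamlit uses st.cache_resource
def train_model(model_name: str, params: dict) -> LGBMRegressor:
    # Cached on its arguments (and referenced data); reruns reuse the fitted model.
    model = LGBMRegressor(**params)
    model.fit(train.filter(like="feature_", axis="columns"), train["target"])
    return model


def on_train_clicked(model_name: str, params: dict):
    # args=[model_name, params] delivers params as one dict, not unpacked.
    with st.spinner("Training model..."):
        st.session_state["model"] = train_model(model_name, params)
    st.success("Done model training!")


params = {"n_estimators": 200, "learning_rate": 0.01, "max_depth": 5}
st.button("Start model training", on_click=on_train_clicked,
          args=["model_target", params])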
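Similarly, a sketch (with illustrative names, not the repo's) of the download step: st.download_button takes str, bytes, or file-like data rather than a pandas object, and a flag flipped inside an on_click callback only survives Streamlit's rerun if it is kept in st.session_state, so the example serializes the predictions with to_csv() and gates the button on session state.

import pandas as pd
import streamlit as st


def on_evaluate_clicked():
    # Illustrative predictions; the app derives these from validation/live data.
    preds = pd.Series([0.2, 0.8, 0.5], name="prediction").rank(pct=True)
    # Keep the CSV in session state so it persists across Streamlit reruns;
    # a module-level flag assigned inside the callback would not.
    st.session_state["validation_csv"] = preds.to_csv().encode("utf-8")


st.button("Start model evaluation", on_click=on_evaluate_clicked)

if "validation_csv" in st.session_state:
    # download_button wants str/bytes/file-like data, hence the to_csv() above.
    st.download_button("Validation data for diagnostics tool",
                       st.session_state["validation_csv"],
                       file_name="validation_predictions.csv",
                       mime="text/csv")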