hellno-o committed
Commit 2d90381
Parent: bd087ef

Add example script and requirements

Files changed (3)
  1. app.py +170 -0
  2. example-scripts +1 -0
  3. requirements.txt +100 -0
app.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+
+ st.title('Numerai Example Script')
+
+
+ # content below from
+ # https://github.com/numerai/example-scripts/blob/master/example_model.py
+ #
+
+ import pandas as pd
+ from lightgbm import LGBMRegressor
+ import gc
+ import json
+ from pathlib import Path
+
+ from numerapi import NumerAPI
+ from utils import (
+     save_model,
+     load_model,
+     neutralize,
+     get_biggest_change_features,
+     validation_metrics,
+     ERA_COL,
+     DATA_TYPE_COL,
+     TARGET_COL,
+     EXAMPLE_PREDS_COL
+ )
+
+
+ # download all the things
+
+ napi = NumerAPI()
+
+ current_round = napi.get_current_round()
+
+ # Tournament data changes every week so we specify the round in their name. Training
+ # and validation data only change periodically, so no need to download them every time.
+ print('Downloading dataset files...')
+
+ Path("./v4").mkdir(parents=False, exist_ok=True)
+ napi.download_dataset("v4/train.parquet")
+ napi.download_dataset("v4/validation.parquet")
+ napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
+ napi.download_dataset("v4/validation_example_preds.parquet")
+ napi.download_dataset("v4/features.json")
+
+ print('Reading minimal training data')
+ # read the feature metadata and get a feature set (or all the features)
+ with open("v4/features.json", "r") as f:
+     feature_metadata = json.load(f)
+ # features = list(feature_metadata["feature_stats"].keys()) # get all the features
+ # features = feature_metadata["feature_sets"]["small"] # get the small feature set
+ features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
+ # read in just those features along with era and target columns
+ read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
+
+ # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
+ # if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
+ training_data = pd.read_parquet('v4/train.parquet',
+                                 columns=read_columns)
+ validation_data = pd.read_parquet('v4/validation.parquet',
+                                   columns=read_columns)
+ live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
+                             columns=read_columns)
+
+
+ # pare down the number of eras to every 4th era
+ # every_4th_era = training_data[ERA_COL].unique()[::4]
+ # training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
+
+ # getting the per era correlation of each feature vs the target
+ all_feature_corrs = training_data.groupby(ERA_COL).apply(
+     lambda era: era[features].corrwith(era[TARGET_COL])
+ )
+
+ # find the riskiest features by comparing their correlation vs
+ # the target in each half of training data; we'll use these later
+ riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
+
+ # "garbage collection" (gc) gets rid of unused data and frees up memory
+ gc.collect()
+
+ model_name = f"model_target"
+ print(f"Checking for existing model '{model_name}'")
+ model = load_model(model_name)
+ if not model:
+     print(f"model not found, creating new one")
+     params = {"n_estimators": 2000,
+               "learning_rate": 0.01,
+               "max_depth": 5,
+               "num_leaves": 2 ** 5,
+               "colsample_bytree": 0.1}
+
+     model = LGBMRegressor(**params)
+
+     # train on all of train and save the model so we don't have to train next time
+     model.fit(training_data.filter(like='feature_', axis='columns'),
+               training_data[TARGET_COL])
+     print(f"saving new model: {model_name}")
+     save_model(model, model_name)
+
+ gc.collect()
+
+ nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()
+
+ # check for nans and fill nans
+ if nans_per_col.any():
+     total_rows = len(live_data[live_data["data_type"] == "live"])
+     print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
+     print(f"out of {total_rows} total rows")
+     print(f"filling nans with 0.5")
+     live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
+
+ else:
+     print("No nans in the features this week!")
+
+
+ # double check the feature that the model expects vs what is available to prevent our
+ # pipeline from failing if Numerai adds more data and we don't have time to retrain!
+ model_expected_features = model.booster_.feature_name()
+ if set(model_expected_features) != set(features):
+     print(f"New features are available! Might want to retrain model {model_name}.")
+ validation_data.loc[:, f"preds_{model_name}"] = model.predict(
+     validation_data.loc[:, model_expected_features])
+ live_data.loc[:, f"preds_{model_name}"] = model.predict(
+     live_data.loc[:, model_expected_features])
+
+ gc.collect()
+
+ # neutralize our predictions to the riskiest features
+ validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+     df=validation_data,
+     columns=[f"preds_{model_name}"],
+     neutralizers=riskiest_features,
+     proportion=1.0,
+     normalize=True,
+     era_col=ERA_COL
+ )
+
+ live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+     df=live_data,
+     columns=[f"preds_{model_name}"],
+     neutralizers=riskiest_features,
+     proportion=1.0,
+     normalize=True,
+     era_col=ERA_COL
+ )
+
+ model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
+
+ # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
+ validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
+ live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
+ validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
+ live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")
+
+ validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
+ validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
+
+ # get some stats about each of our models to compare...
+ # fast_mode=True so that we skip some of the stats that are slower to calculate
+ validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
+ print(validation_stats[["mean", "sharpe"]].to_markdown())
+
+ print(f'''
+ Done! Next steps:
+ 1. Go to numer.ai/tournament (make sure you have an account)
+ 2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
+ 3. Submit live_predictions_{current_round}.csv to the "Upload Predictions" button
+ ''')
example-scripts ADDED
@@ -0,0 +1 @@
+ Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
requirements.txt ADDED
@@ -0,0 +1,100 @@
+ #
+ # This file is autogenerated by pip-compile with python 3.10
+ # To update, run:
+ #
+ #    pip-compile
+ #
+ certifi==2021.10.8
+     # via requests
+ charset-normalizer==2.0.12
+     # via requests
+ click==8.0.4
+     # via numerapi
+ colorama==0.4.4
+     # via
+     #   halo
+     #   log-symbols
+ cycler==0.11.0
+     # via matplotlib
+ fonttools==4.31.1
+     # via matplotlib
+ halo==0.0.31
+     # via -r requirements.in
+ idna==3.3
+     # via requests
+ joblib==1.1.0
+     # via scikit-learn
+ kiwisolver==1.4.0
+     # via matplotlib
+ lightgbm==3.3.2
+     # via -r requirements.in
+ log-symbols==0.0.14
+     # via halo
+ matplotlib==3.5.1
+     # via -r requirements.in
+ numerapi==2.9.4
+     # via -r requirements.in
+ numpy==1.22.3
+     # via
+     #   -r requirements.in
+     #   lightgbm
+     #   matplotlib
+     #   pandas
+     #   pyarrow
+     #   scikit-learn
+     #   scipy
+ packaging==21.3
+     # via matplotlib
+ pandas==1.4.1
+     # via
+     #   -r requirements.in
+     #   numerapi
+ pillow==9.0.1
+     # via matplotlib
+ pyarrow==7.0.0
+     # via -r requirements.in
+ pyparsing==3.0.7
+     # via
+     #   matplotlib
+     #   packaging
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   numerapi
+     #   pandas
+ pytz==2022.1
+     # via
+     #   numerapi
+     #   pandas
+ requests==2.27.1
+     # via
+     #   -r requirements.in
+     #   numerapi
+ scikit-learn==1.0.2
+     # via
+     #   -r requirements.in
+     #   lightgbm
+ scipy==1.8.0
+     # via
+     #   -r requirements.in
+     #   lightgbm
+     #   scikit-learn
+ six==1.16.0
+     # via
+     #   -r requirements.in
+     #   halo
+     #   python-dateutil
+ spinners==0.0.24
+     # via halo
+ tabulate==0.8.9
+     # via -r requirements.in
+ termcolor==1.1.0
+     # via halo
+ threadpoolctl==3.1.0
+     # via scikit-learn
+ tqdm==4.63.0
+     # via numerapi
+ urllib3==1.26.9
+     # via requests
+ wheel==0.37.1
+     # via lightgbm