import streamlit as st

st.title('Numerai Example Script')


# content below adapted from
# https://github.com/numerai/example-scripts/blob/master/example_model.py
#


import pandas as pd
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
import os
from numerapi import NumerAPI
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)

IS_RUNNING_IN_HUGGING_FACE = os.environ.get('HF_ENDPOINT') is not None
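# NOTE: HF_ENDPOINT is used as a simple heuristic for "running inside a Hugging Face
# Space/endpoint"; if your deployment sets a different environment variable, adjust this check.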

napi = NumerAPI()
current_round = napi.get_current_round()

# Tournament data changes every week, so we include the round number in its file name.
# Training and validation data only change periodically, so there is no need to
# download them every time.

Path("./v4").mkdir(parents=False, exist_ok=True)
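# napi.download_dataset below writes its files under ./v4, so make sure the directory
# exists. Locally every parquet file lives here; on Hugging Face the large
# train/validation files are instead read from the Numerati/numerai-datasets cache
# (see get_dataset_path below).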

@st.cache
def get_dataset_path():
    if IS_RUNNING_IN_HUGGING_FACE:
        from datasets import load_dataset_builder
        ds_builder = load_dataset_builder("Numerati/numerai-datasets")
        return ds_builder.cache_dir
    else:
        return "./v4"

@st.cache
def download_dataset():
    print('download_dataset')
    # the big train/validation files only need to be fetched from the Numerai API when
    # running locally; on Hugging Face they come from the dataset cache (see get_dataset_path)
    if not IS_RUNNING_IN_HUGGING_FACE:
        napi.download_dataset("v4/train.parquet")
        napi.download_dataset("v4/validation.parquet")
        napi.download_dataset("v4/validation_example_preds.parquet")
        napi.download_dataset("v4/features.json")

    napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
    print('done download_dataset')

@st.cache
def load_dataset(feature_set: str):
    dataset_path = get_dataset_path()
    print(f'load_dataset with feature_set {feature_set} and path {dataset_path}')
    # read the feature metadata and get a feature set (or all the features)
    with open(f"{dataset_path}/features.json", "r") as f:
        feature_metadata = json.load(f)
    # features = list(feature_metadata["feature_stats"].keys())  # get all the features
    # features = feature_metadata["feature_sets"]["small"]  # get the small feature set
    features = feature_metadata["feature_sets"][feature_set]  # get the chosen feature set
    # read in just those features along with era and target columns
    read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

    # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
    # if so, delete the file and rerun the napi.download_dataset to fix the
    # corrupted file
    training_data = pd.read_parquet(f'{dataset_path}/train.parquet',
                                    columns=read_columns)
    validation_data = pd.read_parquet(f'{dataset_path}/validation.parquet',
                                      columns=read_columns)
    live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
                                columns=read_columns)

    # pare down the number of eras to every 4th era
    # every_4th_era = training_data[ERA_COL].unique()[::4]
    # training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]

    # getting the per era correlation of each feature vs the target
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda era: era[features].corrwith(era[TARGET_COL])
    )

    # find the riskiest features by comparing their correlation vs
    # the target in each half of training data; we'll use these later
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
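    # (get_biggest_change_features is expected to return the 50 features whose per-era
    # correlation with the target shifts the most between the two halves of the
    # training eras.)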

    # "garbage collection" (gc) gets rid of unused data and frees up memory
    gc.collect()
    print('done with feature_set', feature_set)
    return training_data, validation_data, live_data, features, riskiest_features

feature_set = st.selectbox(
    'Which feature set should be used?',
    ('small', 'medium', 'fncv3_features', 'v2_equivalent_features', 'v3_equivalent_features'))
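# each option is a key under "feature_sets" in features.json (see load_dataset);
# smaller sets download and train noticeably faster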

data_load_state = st.text('Loading data...')
download_dataset()
training_data, validation_data, live_data, features, riskiest_features = load_dataset(feature_set)
data_load_state.text('Loading data...done!')

st.subheader('Raw data')
st.write(training_data.head())

st.subheader('Model Configuration')

n_estimators = st.slider('n_estimators', 100, 10000, 2000)
learning_rate = st.slider('learning_rate', 0.0001, 0.1, 0.01)
max_depth = st.slider('max_depth', 2, 20, 5)

params = {"n_estimators": n_estimators,
          "learning_rate": learning_rate,
          "max_depth": max_depth,
          "num_leaves": 2 ** 5,
          "colsample_bytree": 0.1
          }
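# num_leaves and colsample_bytree stay fixed; only the three slider values above are
# exposed in the UI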

model_name = "model_target"

@st.cache
def get_model_and_fit(model_name, params):
    print('get_model_and_fit')
    model = load_model(model_name)
    if not model:
        with st.spinner('Waiting for model training...'):
            print("model not found, creating a new one")

            model = LGBMRegressor(**params)

            # train on all of train and save the model so we don't have to
            # train next time
            model.fit(training_data.filter(like='feature_', axis='columns'),
                      training_data[TARGET_COL])
            print(f"saving new model: {model_name}")
            save_model(model, model_name)
        st.success('Done model training!')

    gc.collect()
    print('done get_model_and_fit')



# track in session_state whether predictions exist yet, so the download button at the
# bottom only appears after a successful evaluation run
if "has_model_preds" not in st.session_state:
    st.session_state.has_model_preds = False

@st.cache
def get_model_preds(model_name, params):
    print('get_model_preds')
    model = load_model(model_name)

    st.session_state.has_model_preds = False
    nans_per_col = live_data[live_data["data_type"]
                             == "live"][features].isna().sum()

    # check for nans and fill nans
    if nans_per_col.any():
        total_rows = len(live_data[live_data["data_type"] == "live"])
        print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
        print(f"out of {total_rows} total rows")
        print(f"filling nans with 0.5")
        live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)

    else:
        print("No nans in the features this week!")


    # double check the features that the model expects vs what is available to prevent
    # our pipeline from failing if Numerai adds more data and we don't have time
    # to retrain!
    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(features):
        print(f"New features are available! Might want to retrain model {model_name}.")
    validation_data.loc[:, f"preds_{model_name}"] = model.predict(
        validation_data.loc[:, model_expected_features])
    live_data.loc[:, f"preds_{model_name}"] = model.predict(
        live_data.loc[:, model_expected_features])

    gc.collect()

    # neutralize our predictions to the riskiest features
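    # (neutralize comes from utils; in the upstream example scripts it removes, era by
    # era, the component of the prediction column that is linearly explained by the
    # neutralizer features, reducing exposure to those risky features.)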
    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
        df=validation_data,
        columns=[f"preds_{model_name}"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL
    )

    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
        df=live_data,
        columns=[f"preds_{model_name}"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL
    )

    model_to_submit = f"preds_{model_name}_neutral_riskiest_50"

    # rename best model to "prediction" and rank from 0 to 1 to meet upload
    # requirements
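    # (rank(pct=True) turns the raw scores into percentile ranks in (0, 1].)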
    validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
    live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
    validation_prediction_fname = f"validation_predictions_{current_round}.csv"
    validation_data["prediction"].to_csv(validation_prediction_fname)
    live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")

    validation_preds = pd.read_parquet(f'{get_dataset_path()}/validation_example_preds.parquet')
    validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]

    # get some stats about each of our models to compare...
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    print('start validation_metrics')
    validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
    st.markdown(validation_stats[["mean", "sharpe"]].to_markdown())

    # st.write(f'''
    # Done! Next steps:
    #     1. Go to numer.ai/tournament (make sure you have an account)
    #     2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
    #     3. Submit tournament_predictions_{current_round}.csv to the "Upload Predictions" button
    # ''')
    st.session_state.has_model_preds = True
    st.session_state.validation_prediction_fname = validation_prediction_fname


st.button('Start model training', on_click=get_model_and_fit, args=[model_name, params])
st.button('Start model evaluation', on_click=get_model_preds, args=[model_name, params])
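# on_click callbacks run before the script re-executes, so the session_state flag set
# inside get_model_preds is visible on the very next rerun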

if st.session_state.has_model_preds:
    validation_prediction_fname = st.session_state.validation_prediction_fname
    with open(validation_prediction_fname) as f:
        st.download_button('Validation data for diagnostics tool', f.read(),
                           file_name=validation_prediction_fname)
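
# To try this locally (assuming the script is saved as app.py):
#   streamlit run app.py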