hellno-o committed
Commit 1e601a1
1 Parent(s): 2d90381

add python files from official repo

Files changed (3)
  1. example-scripts +0 -1
  2. example_model_advanced.py +296 -0
  3. utils.py +312 -0
example-scripts DELETED
@@ -1 +0,0 @@
-Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
 
 
example_model_advanced.py ADDED
@@ -0,0 +1,296 @@
+import pandas as pd
+from lightgbm import LGBMRegressor
+import gc
+from numerapi import NumerAPI
+from pathlib import Path
+from utils import (
+    save_model,
+    load_model,
+    neutralize,
+    get_biggest_change_features,
+    get_time_series_cross_val_splits,
+    validation_metrics,
+    load_model_config,
+    save_model_config,
+    save_prediction,
+    TARGET_COL,
+)
+
+
+EXAMPLE_PREDS_COL = "example_preds"
+ERA_COL = "era"
+# params we'll use to train all of our models.
+# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
+model_params = {"n_estimators": 2000,
+                "learning_rate": 0.01,
+                "max_depth": 5,
+                "num_leaves": 2 ** 5,
+                "colsample_bytree": 0.1}
+
+# the amount of downsampling we'll use to speed up cross validation and full train.
+# a value of 1 means no downsampling
+# a value of 10 means use every 10th row
+downsample_cross_val = 20
+downsample_full_train = 2
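+# e.g. iloc[::10] keeps rows 0, 10, 20, ...; this thins rows, not eras,
+# so every era is still represented, just with fewer rows each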
+
+# if model_selection_loop=True, get OOS performance for training_data
+# and use that to select the best model.
+# if model_selection_loop=False, just predict on tournament data using existing models and model config
+model_selection_loop = True
+model_config_name = "advanced_example_model"
+
+napi = NumerAPI()
+
+current_round = napi.get_current_round()
+
+Path("./v4").mkdir(parents=False, exist_ok=True)
+napi.download_dataset("v4/train.parquet")
+napi.download_dataset("v4/features.json")
+
+
+print("Entering model selection loop. This may take a while.")
+if model_selection_loop:
+    model_config = {}
+    print('reading training_data')
+    training_data = pd.read_parquet('v4/train.parquet')
+
+    # keep track of some prediction columns
+    ensemble_cols = set()
+    pred_cols = set()
+
+    # pick some targets to use
+    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
+    # hand-pick a handful of targets
+    # this can be vastly improved
+    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]
+
+    # all the possible features to train on
+    feature_cols = [c for c in training_data if c.startswith("feature_")]
+
+    """ do cross val to get out of sample training preds"""
+    cv = 3
+    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
+    # get out of sample training preds via embargoed time series cross validation
+    # optionally downsample training data to speed up this section.
+    print("entering time series cross validation loop")
+    for split, train_test_split in enumerate(train_test_zip):
+        gc.collect()
+        print(f"doing split {split+1} out of {cv}")
+        train_split, test_split = train_test_split
+        train_split_index = training_data[ERA_COL].isin(train_split)
+        test_split_index = training_data[ERA_COL].isin(test_split)
+        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]
+
+        # getting the per era correlation of each feature vs the primary target across the training split
+        print("getting feature correlations over time and identifying riskiest features")
+        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
+            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
+        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
+        # there are probably more clever ways to do this
+        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)
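+        # "riskiest" = the 50 features whose mean per-era corr with the target moved the most
+        # between the first and second half of the training eras (see get_biggest_change_features)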
+
+        print(f"entering model training loop for split {split+1}")
+        for target in targets:
+            model_name = f"model_{target}"
+            print(f"model: {model_name}")
+
+            # train a model on the training split (and save it for future use)
+            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
+            split_model = load_model(split_model_name)
+            if not split_model:
+                print(f"training model: {model_name}")
+                split_model = LGBMRegressor(**model_params)
+                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
+                                training_data.loc[downsampled_train_split_index, [target]])
+                save_model(split_model, split_model_name)
+            # now we can predict on the test part of the split
+            model_expected_features = split_model.booster_.feature_name()
+            if set(model_expected_features) != set(feature_cols):
+                print(f"New features are available! Might want to retrain model {split_model_name}.")
+            print(f"predicting {model_name}")
+            training_data.loc[test_split_index, f"preds_{model_name}"] = \
+                split_model.predict(training_data.loc[test_split_index, model_expected_features])
+
+            # do neutralization
+            print("doing neutralization to riskiest features")
+            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+                df=training_data.loc[test_split_index, :],
+                columns=[f"preds_{model_name}"],
+                neutralizers=riskiest_features_split,
+                proportion=1.0,
+                normalize=True,
+                era_col=ERA_COL)[f"preds_{model_name}"]
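+            # neutralize() regresses these preds on the riskiest features within each era and
+            # keeps the residual, so the neutralized preds have ~zero linear exposure to them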
+
+            # remember that we made all of these different pred columns
+            pred_cols.add(f"preds_{model_name}")
+            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
+
+    print("creating ensembles")
+    # ranking per era for all of our pred cols so we can combine safely on the same scales
+    training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
+        lambda d: d[list(pred_cols)].rank(pct=True))
+    # do ensembles
+    training_data["ensemble_neutral_riskiest_50"] = sum(
+        [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+        pct=True)
+    training_data["ensemble_not_neutral"] = sum(
+        [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+    training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+
+    ensemble_cols.add("ensemble_neutral_riskiest_50")
+    ensemble_cols.add("ensemble_not_neutral")
+    ensemble_cols.add("ensemble_all")
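+    # summing per-era percentile ranks and re-ranking is an equal-weight ensemble;
+    # the final rank(pct=True) just maps the sum back onto a 0-1 scale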
+
+    """ Now get some stats and pick our favorite model"""
+    print("gathering validation metrics for out of sample training results")
+    all_model_cols = list(pred_cols) + list(ensemble_cols)
+    # use example_col preds_model_target as an estimate since no example preds are provided for training
+    # fast_mode=True so that we skip some of the stats that are slower to calculate
+    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
+                                        fast_mode=True, target_col=TARGET_COL)
+    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())
+
+    # pick the model that has the highest correlation sharpe
+    best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
+    print(f"selecting model {best_pred_col} as our highest sharpe model in validation")
+
+    """ Now do a full train"""
+    print("entering full training section")
+    # getting the per era correlation of each feature vs the target across all of training data
+    print("getting feature correlations with target and identifying riskiest features")
+    all_feature_corrs = training_data.groupby(ERA_COL).apply(
+        lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
+    # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
+    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
+
+    for target in targets:
+        gc.collect()
+        model_name = f"model_{target}_downsample{downsample_full_train}"
+        model = load_model(model_name)
+        if not model:
+            print(f"training {model_name}")
+            model = LGBMRegressor(**model_params)
+            # train on all of train, predict on val, predict on tournament
+            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
+                      training_data.iloc[::downsample_full_train][target])
+            save_model(model, model_name)
+        gc.collect()
+
+    model_config["feature_cols"] = feature_cols
+    model_config["targets"] = targets
+    model_config["best_pred_col"] = best_pred_col
+    model_config["riskiest_features"] = riskiest_features
+    print(f"saving model config for {model_config_name}")
+    save_model_config(model_config, model_config_name)
+else:
+    # load model config from previous model selection loop
+    print(f"loading model config for {model_config_name}")
+    model_config = load_model_config(model_config_name)
+    feature_cols = model_config["feature_cols"]
+    targets = model_config["targets"]
+    best_pred_col = model_config["best_pred_col"]
+    riskiest_features = model_config["riskiest_features"]
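+# the config is a small JSON file (model_configs/advanced_example_model.json) holding
+# feature_cols, targets, best_pred_col, and riskiest_features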
+
+
+""" Things that we always do even if we've already trained """
+gc.collect()
+
+# live data and example predictions change every round, so download fresh copies
+napi.download_dataset("v4/live.parquet")
+napi.download_dataset("v4/validation.parquet")
+napi.download_dataset("v4/live_example_preds.parquet")
+napi.download_dataset("v4/validation_example_preds.parquet")
+
+print("reading tournament_data")
+live_data = pd.read_parquet('v4/live.parquet')
+print("reading validation_data")
+validation_data = pd.read_parquet('v4/validation.parquet')
+print("reading example_predictions")
+example_preds = pd.read_parquet('v4/live_example_preds.parquet')
+print("reading example_validation_predictions")
+validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
+# set the example predictions
+validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]
+
+# check for nans and fill nans
+print("checking for nans in the tournament data")
+if live_data.loc[:, feature_cols].isna().sum().sum():
+    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
+    total_rows = len(live_data)
+    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
+    print(f"out of {total_rows} total rows")
+    print("filling nans with 0.5")
+    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
+else:
+    print("No nans in the features this week!")
+
+
+pred_cols = set()
+ensemble_cols = set()
+for target in targets:
+    gc.collect()
+    model_name = f"model_{target}_downsample{downsample_full_train}"
+    print(f"loading {model_name}")
+    model = load_model(model_name)
+    if not model:
+        raise ValueError(f"{model_name} is not trained yet!")
+
+    model_expected_features = model.booster_.feature_name()
+    if set(model_expected_features) != set(feature_cols):
+        print(f"New features are available! Might want to retrain model {model_name}.")
+    print(f"predicting tournament and validation for {model_name}")
+    validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
+    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])
+
+    # do different neutralizations
+    # neutralize our predictions to the riskiest features only
+    print("neutralizing to riskiest_50 for validation and tournament")
+    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
+                                                                            columns=[f"preds_{model_name}"],
+                                                                            neutralizers=riskiest_features,
+                                                                            proportion=1.0,
+                                                                            normalize=True,
+                                                                            era_col=ERA_COL)[f"preds_{model_name}"]
+    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
+                                                                      columns=[f"preds_{model_name}"],
+                                                                      neutralizers=riskiest_features,
+                                                                      proportion=1.0,
+                                                                      normalize=True,
+                                                                      era_col=ERA_COL)[f"preds_{model_name}"]
+
+    pred_cols.add(f"preds_{model_name}")
+    pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
+
+
+# rank per era for each prediction column so that we can combine safely
+validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
+live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
+# make ensembles for val and tournament
+print('creating ensembles for tournament and validation')
+validation_data["ensemble_neutral_riskiest_50"] = sum(
+    [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+    pct=True)
+live_data["ensemble_neutral_riskiest_50"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
+    pct=True)
+ensemble_cols.add("ensemble_neutral_riskiest_50")
+
+validation_data["ensemble_not_neutral"] = sum(
+    [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+live_data["ensemble_not_neutral"] = sum(
+    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
+ensemble_cols.add("ensemble_not_neutral")
+
+validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
+
+ensemble_cols.add("ensemble_all")
+
+gc.collect()
+print("getting final validation stats")
+# get our final validation stats for our chosen model
+validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols),
+                                      example_col=EXAMPLE_PREDS_COL, fast_mode=False, target_col=TARGET_COL)
+print(validation_stats.to_markdown())
+
+# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
+validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
+live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
+save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
+save_prediction(live_data["prediction"], f"live_data_{current_round}")
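+# save_prediction (utils.py) writes these to prediction_files/validation_predictions_<round>.csv
+# and prediction_files/live_data_<round>.csv for diagnostics and submission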
utils.py ADDED
@@ -0,0 +1,312 @@
+import numpy as np
+import pandas as pd
+import scipy
+from halo import Halo
+from pathlib import Path
+import json
+from scipy.stats import skew
+
+ERA_COL = "era"
+TARGET_COL = "target_nomi_v4_20"
+DATA_TYPE_COL = "data_type"
+EXAMPLE_PREDS_COL = "example_preds"
+
+spinner = Halo(text='', spinner='dots')
+
+MODEL_FOLDER = "models"
+MODEL_CONFIGS_FOLDER = "model_configs"
+PREDICTION_FILES_FOLDER = "prediction_files"
+
+
+def save_prediction(df, name):
+    try:
+        Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
+    except Exception as ex:
+        pass
+    df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)
+
+
+def save_model(model, name):
+    try:
+        Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
+    except Exception as ex:
+        pass
+    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")
+
+
+def load_model(name):
+    path = Path(f"{MODEL_FOLDER}/{name}.pkl")
+    if path.is_file():
+        model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl")
+    else:
+        model = False
+    return model
+
+
+def save_model_config(model_config, model_name):
+    try:
+        Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True)
+    except Exception as ex:
+        pass
+    with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", 'w') as fp:
+        json.dump(model_config, fp)
+
+
+def load_model_config(model_name):
+    path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
+    path = Path(path_str)
+    if path.is_file():
+        with open(path_str, 'r') as fp:
+            model_config = json.load(fp)
+    else:
+        model_config = False
+    return model_config
+
+
+def get_biggest_change_features(corrs, n):
+    all_eras = corrs.index.sort_values()
+    h1_eras = all_eras[:len(all_eras) // 2]
+    h2_eras = all_eras[len(all_eras) // 2:]
+
+    h1_corr_means = corrs.loc[h1_eras, :].mean()
+    h2_corr_means = corrs.loc[h2_eras, :].mean()
+
+    corr_diffs = h2_corr_means - h1_corr_means
+    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
+    return worst_n
+
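+# e.g. a feature whose mean per-era corr with the target is +0.02 in the first half of eras
+# and -0.03 in the second half gets an abs diff of 0.05 and will rank among the riskiest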
+
+
+def get_time_series_cross_val_splits(data, cv=3, embargo=12):
+    all_train_eras = data[ERA_COL].unique()
+    len_split = len(all_train_eras) // cv
+    test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
+    # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
+    remainder = len(all_train_eras) % cv
+    if remainder != 0:
+        test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:])
+
+    train_splits = []
+    for test_split in test_splits:
+        test_split_max = int(np.max(test_split))
+        test_split_min = int(np.min(test_split))
+        # get all of the eras that aren't in the test split
+        train_split_not_embargoed = [e for e in all_train_eras if not (test_split_min <= int(e) <= test_split_max)]
+        # embargo the train split so we have no leakage.
+        # one era is length 5, so we need to embargo by target_length/5 eras.
+        # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
+        train_split = [e for e in train_split_not_embargoed if
+                       abs(int(e) - test_split_max) > embargo and abs(int(e) - test_split_min) > embargo]
+        train_splits.append(train_split)
+
+    # convenient way to iterate over train and test splits
+    train_test_zip = zip(train_splits, test_splits)
+    return train_test_zip
+
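+# e.g. with 120 eras, cv=3, embargo=12: the middle test split is eras 41-80 and its train
+# split is eras 1-28 plus 93-120 (the rest fall inside the test range or the embargo)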
+
+def neutralize(df,
+               columns,
+               neutralizers=None,
+               proportion=1.0,
+               normalize=True,
+               era_col="era"):
+    if neutralizers is None:
+        neutralizers = []
+    unique_eras = df[era_col].unique()
+    computed = []
+    for u in unique_eras:
+        df_era = df[df[era_col] == u]
+        scores = df_era[columns].values
+        if normalize:
+            scores2 = []
+            for x in scores.T:
+                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
+                x = scipy.stats.norm.ppf(x)
+                scores2.append(x)
+            scores = np.array(scores2).T
+        exposures = df_era[neutralizers].values
+
+        scores -= proportion * exposures.dot(
+            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))
+
+        scores /= scores.std(ddof=0)
+
+        computed.append(scores)
+
+    return pd.DataFrame(np.concatenate(computed),
+                        columns=columns,
+                        index=df.index)
+
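+# per era this computes scores - proportion * X @ pinv(X) @ scores, i.e. it subtracts the
+# least-squares projection of the (gaussianized) scores onto the neutralizer columns X,
+# leaving residuals with ~zero linear exposure to each neutralizer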
+
+def neutralize_series(series, by, proportion=1.0):
+    scores = series.values.reshape(-1, 1)
+    exposures = by.values.reshape(-1, 1)
+
+    # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
+    exposures = np.hstack(
+        (exposures,
+         np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)))
+
+    correction = proportion * (exposures.dot(
+        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
+    corrected_scores = scores - correction
+    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
+    return neutralized
+
+
+def unif(df):
+    x = (df.rank(method="first") - 0.5) / len(df)
+    return pd.Series(x, index=df.index)
+
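+# unif maps a column to evenly spaced percentiles in (0, 1): 5 values become
+# 0.1, 0.3, 0.5, 0.7, 0.9, with ties broken by order of appearance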
+
+def get_feature_neutral_mean(df, prediction_col, target_col, features_for_neutralization=None):
+    if features_for_neutralization is None:
+        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
+    df.loc[:, "neutral_sub"] = neutralize(df, [prediction_col],
+                                          features_for_neutralization)[prediction_col]
+    scores = df.groupby("era").apply(
+        lambda x: (unif(x["neutral_sub"]).corr(x[target_col]))).mean()
+    return np.mean(scores)
+
+
+def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb, features_for_neutralization=None):
+    if features_for_neutralization is None:
+        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
+    temp_df = df.reset_index(drop=True).copy()  # Reset index due to use of argsort later
+    temp_df.loc[:, "neutral_sub"] = neutralize(temp_df, [prediction_col],
+                                               features_for_neutralization)[prediction_col]
+    temp_df_argsort = temp_df.loc[:, 'neutral_sub'].argsort()
+    temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb],
+                                temp_df_argsort.iloc[-tb:]])
+    temp_df_tb = temp_df.loc[temp_df_tb_idx]
+    tb_fnc = unif(temp_df_tb['neutral_sub']).corr(temp_df_tb[target_col])
+    return tb_fnc
+
+
+def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
+    unique_eras = df[era_col].unique()
+    computed = []
+    for u in unique_eras:
+        df_era = df[df[era_col] == u]
+        era_pred = np.float64(df_era[columns].values.T)
+        era_target = np.float64(df_era[target].values.T)
+
+        if tb is None:
+            ccs = np.corrcoef(era_target, era_pred)[0, 1:]
+        else:
+            tbidx = np.argsort(era_pred, axis=1)
+            tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
+            ccs = [np.corrcoef(era_target[tmpidx], tmppred[tmpidx])[0, 1]
+                   for tmpidx, tmppred in zip(tbidx, era_pred)]
+            ccs = np.array(ccs)
+
+        computed.append(ccs)
+
+    return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique())
+
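+# when tb is set, only the tb lowest and tb highest predictions in each era (e.g. TB200)
+# enter that era's correlation, measuring how good the model is at the extremes
+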
+def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
+    if feature_cols is None:
+        feature_cols = [c for c in df.columns if c.startswith("feature")]
+    u = df.loc[:, feature_cols].corrwith(df[prediction_col])
+    e = df.loc[:, feature_cols].corrwith(df[example_col])
+    return 1 - (np.dot(u, e) / np.dot(e, e))
+
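+# 1 - (u.e)/(e.e) is 0 when your feature exposures match the example model's exactly and
+# grows as they diverge (1 means orthogonal exposures)
+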
+def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False,
+                       target_col=TARGET_COL, features_for_neutralization=None):
+    validation_stats = pd.DataFrame()
+    feature_cols = [c for c in validation_data if c.startswith("feature_")]
+    for pred_col in pred_cols:
+        # Check the per-era correlations on the validation set (out of sample)
+        validation_correlations = validation_data.groupby(ERA_COL).apply(
+            lambda d: unif(d[pred_col]).corr(d[target_col]))
+
+        mean = validation_correlations.mean()
+        std = validation_correlations.std(ddof=0)
+        sharpe = mean / std
+
+        validation_stats.loc["mean", pred_col] = mean
+        validation_stats.loc["std", pred_col] = std
+        validation_stats.loc["sharpe", pred_col] = sharpe
+
+        rolling_max = (validation_correlations + 1).cumprod().rolling(window=9000,  # arbitrarily large
+                                                                      min_periods=1).max()
+        daily_value = (validation_correlations + 1).cumprod()
+        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
+        validation_stats.loc["max_drawdown", pred_col] = max_drawdown
+
+        payout_scores = validation_correlations.clip(-0.25, 0.25)
+        payout_daily_value = (payout_scores + 1).cumprod()
+
+        apy = (
+            (
+                (payout_daily_value.dropna().iloc[-1])
+                ** (1 / len(payout_scores))
+            )
+            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
+            - 1
+        ) * 100
+
+        validation_stats.loc["apy", pred_col] = apy
+
+        if not fast_mode:
+            # Check the feature exposure of your validation predictions
+            max_per_era = validation_data.groupby(ERA_COL).apply(
+                lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max())
+            max_feature_exposure = max_per_era.mean()
+            validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure
+
+            # Check feature neutral mean
+            feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col,
+                                                            target_col, features_for_neutralization)
+            validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean
+
+            # Check TB200 feature neutral mean
+            tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(
+                lambda df: get_feature_neutral_mean_tb_era(df, pred_col, target_col, 200,
+                                                           features_for_neutralization))
+            validation_stats.loc["tb200_feature_neutral_mean", pred_col] = tb200_feature_neutral_mean_era.mean()
+
+            # Check top and bottom 200 metrics (TB200)
+            tb200_validation_correlations = fast_score_by_date(
+                validation_data,
+                [pred_col],
+                target_col,
+                tb=200,
+                era_col=ERA_COL
+            )
+
+            tb200_mean = tb200_validation_correlations.mean()[pred_col]
+            tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
+            tb200_sharpe = tb200_mean / tb200_std
+
+            validation_stats.loc["tb200_mean", pred_col] = tb200_mean
+            validation_stats.loc["tb200_std", pred_col] = tb200_std
+            validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe
+
+            # MMC over validation
+            mmc_scores = []
+            corr_scores = []
+            for _, x in validation_data.groupby(ERA_COL):
+                series = neutralize_series(unif(x[pred_col]), x[example_col])
+                mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
+                corr_scores.append(unif(x[pred_col]).corr(x[target_col]))
+
+            val_mmc_mean = np.mean(mmc_scores)
+            val_mmc_std = np.std(mmc_scores)
+            corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
+            corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
+
+            validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
+            validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe
+
+            # Check correlation with example predictions
+            per_era_corrs = validation_data.groupby(ERA_COL).apply(
+                lambda d: unif(d[pred_col]).corr(unif(d[example_col])))
+            corr_with_example_preds = per_era_corrs.mean()
+            validation_stats.loc["corr_with_example_preds", pred_col] = corr_with_example_preds
+
+            # Check exposure dissimilarity per era
+            tdf = validation_data.groupby(ERA_COL).apply(
+                lambda df: exposure_dissimilarity_per_era(df, pred_col, example_col, feature_cols))
+            validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()
+
+    # .transpose so that stats are columns and the model_name is the row
+    return validation_stats.transpose()
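+# usage sketch (assumes a validation frame that already has your prediction columns):
+#   stats = validation_metrics(validation_data, ["prediction"],
+#                              example_col=EXAMPLE_PREDS_COL, fast_mode=True)
+#   print(stats.to_markdown())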