Spaces:
Runtime error
Runtime error
import numpy as np | |
import pandas as pd | |
import scipy | |
from halo import Halo | |
from pathlib import Path | |
import json | |
from scipy.stats import skew | |
# Column-name constants.
ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

# Module-level console spinner: "dots" animation, empty initial text.
spinner = Halo(text="", spinner="dots")

# Folder names used by the save_* / load_* helpers in this module.
MODEL_FOLDER = "models"
MODEL_CONFIGS_FOLDER = "model_configs"
PREDICTION_FILES_FOLDER = "prediction_files"
def save_prediction(df, name):
    """Write ``df`` to ``<PREDICTION_FILES_FOLDER>/<name>.csv`` with its index.

    The output folder (and any missing parents) is created on demand.

    Args:
        df: Object exposing ``to_csv`` (e.g. a pandas DataFrame).
        name: File stem; ``.csv`` is appended.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
    """
    # exist_ok=True already tolerates an existing directory, so no blanket
    # try/except is needed; any remaining error means the folder is unusable
    # and should surface here rather than as a confusing failure in to_csv.
    Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
    df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)
def save_model(model, name):
    """Pickle ``model`` to ``<MODEL_FOLDER>/<name>.pkl``.

    The output folder (and any missing parents) is created on demand.

    Args:
        model: Any picklable object.
        name: File stem; ``.pkl`` is appended.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
    """
    # exist_ok=True already covers the "folder is already there" case; other
    # errors are real problems and are allowed to propagate instead of being
    # silently swallowed.
    Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")
def load_model(name):
    """Return the object pickled at ``<MODEL_FOLDER>/<name>.pkl``.

    Returns ``False`` when no such file exists.
    """
    pickle_path = f"{MODEL_FOLDER}/{name}.pkl"
    if not Path(pickle_path).is_file():
        return False
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    return pd.read_pickle(pickle_path)
def save_model_config(model_config, model_name):
    """Serialise ``model_config`` as JSON to ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    The output folder (and any missing parents) is created on demand.

    Args:
        model_config: JSON-serialisable object.
        model_name: File stem; ``.json`` is appended.

    Raises:
        OSError: If the folder cannot be created or the file cannot be opened.
        TypeError: If ``model_config`` is not JSON-serialisable.
    """
    # exist_ok=True makes an existing folder a no-op; anything else that goes
    # wrong is reported immediately rather than being silently ignored.
    Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True)
    with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", 'w') as fp:
        json.dump(model_config, fp)
def load_model_config(model_name):
    """Return the parsed JSON stored at ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    Returns ``False`` when no such file exists.
    """
    config_path = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
    if not Path(config_path).is_file():
        return False
    with open(config_path, 'r') as handle:
        return json.load(handle)
def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns whose mean changed most between index halves.

    The index of ``corrs`` is sorted and cut at its midpoint; column means are
    taken over each half, and the columns are ranked by the absolute difference
    of the two means, largest first.

    Args:
        corrs: DataFrame indexed by era, one column per feature.
        n: Number of column labels to return.

    Returns:
        list: Column labels, ordered from largest to smallest change.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2
    first_half_mean = corrs.loc[ordered_eras[:midpoint], :].mean()
    second_half_mean = corrs.loc[ordered_eras[midpoint:], :].mean()
    shift = (second_half_mean - first_half_mean).abs()
    ranked = shift.sort_values(ascending=False)
    return ranked.head(n).index.tolist()
def get_time_series_cross_val_splits(data, cv=3, embargo=12):
    """Build embargoed train/test era splits for cross-validation.

    The unique eras of ``data[ERA_COL]`` are cut into ``cv`` consecutive test
    folds of equal size; leftover eras (when the count is not divisible by
    ``cv``) are appended to the last fold. For each fold, the training eras
    are those outside the fold's numeric [min, max] range that are also more
    than ``embargo`` eras away from both ends of that range.

    Args:
        data: DataFrame with an ``ERA_COL`` column whose values convert via ``int()``.
        cv: Number of folds.
        embargo: Minimum era distance between training eras and the test fold.

    Returns:
        A ``zip`` iterator of ``(train_eras, test_eras)`` pairs (single use).
    """
    eras = data[ERA_COL].unique()
    fold_size = len(eras) // cv

    test_splits = []
    for fold in range(cv):
        start = fold * fold_size
        test_splits.append(eras[start:start + fold_size])

    # Give any leftover eras to the final fold so that no era is dropped.
    leftover = len(eras) % cv
    if leftover != 0:
        test_splits[-1] = np.append(test_splits[-1], eras[-leftover:])

    train_splits = []
    for test_eras in test_splits:
        last_test_era = int(np.max(test_eras))
        first_test_era = int(np.min(test_eras))

        kept = []
        for era in eras:
            era_number = int(era)
            # Skip eras that fall inside the test range.
            if first_test_era <= era_number <= last_test_era:
                continue
            # Embargo: eras are 5 long, so a 60-long target needs 60/5 == 12
            # eras of separation on either side to avoid leakage.
            far_from_end = abs(era_number - last_test_era) > embargo
            far_from_start = abs(era_number - first_test_era) > embargo
            if far_from_end and far_from_start:
                kept.append(era)
        train_splits.append(kept)

    # Pair each training list with its test fold for convenient iteration.
    return zip(train_splits, test_splits)
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize ``columns`` against ``neutralizers`` era by era.

    For every distinct value of ``df[era_col]`` the selected score columns are
    optionally rank-gaussianized, have ``proportion`` of their least-squares
    projection onto the neutralizer columns subtracted, and are divided by the
    standard deviation of the whole per-era score block.

    Results are written back to the row positions they came from, so the
    returned frame lines up with ``df`` even when the rows of an era are not
    contiguous in ``df``. For era-contiguous input the output is unchanged.

    Args:
        df: Input frame containing ``era_col``, ``columns`` and ``neutralizers``.
        columns: List of score column names to neutralize.
        neutralizers: List of column names to neutralize against
            (``None`` means no neutralizers).
        proportion: Fraction of the projection to remove.
        normalize: If true, replace each score column by the normal quantiles
            of its ordinal ranks before neutralizing.
        era_col: Name of the grouping column.

    Returns:
        pandas.DataFrame with columns ``columns`` and index ``df.index``.
    """
    if neutralizers is None:
        neutralizers = []
    era_values = df[era_col]
    era_scores = []
    era_positions = []
    for era in era_values.unique():
        in_era = (era_values == era).values
        df_era = df[in_era]
        scores = df_era[columns].values
        if normalize:
            gaussianized = []
            for col_scores in scores.T:
                ranks = scipy.stats.rankdata(col_scores, method='ordinal')
                # (rank - 0.5) / n keeps quantiles strictly inside (0, 1)
                gaussianized.append(
                    scipy.stats.norm.ppf((ranks - .5) / len(col_scores)))
            scores = np.array(gaussianized).T
        exposures = df_era[neutralizers].values
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))
        scores /= scores.std(ddof=0)
        era_scores.append(scores)
        # Remember where these rows sit in df so they can be put back in place.
        era_positions.append(np.flatnonzero(in_era))
    stacked = np.concatenate(era_scores)
    row_order = np.concatenate(era_positions)
    # Scatter the era-grouped rows back into df's original row order.
    aligned = np.empty_like(stacked)
    aligned[row_order] = stacked
    return pd.DataFrame(aligned,
                        columns=columns,
                        index=df.index)
def neutralize_series(series, by, proportion=1.0):
    """Remove from ``series`` the part explained linearly by ``by``.

    ``series`` is regressed (least squares) on two columns: ``by`` and a
    constant column filled with the mean of ``series``. ``proportion`` of the
    fitted values is subtracted from ``series``.

    Args:
        series: pandas Series of scores.
        by: pandas Series of the same length to neutralize against.
        proportion: Fraction of the fitted component to subtract.

    Returns:
        pandas.Series indexed like ``series``.
    """
    target = series.values.reshape(-1, 1)
    by_column = by.values.reshape(-1, 1)
    # The extra constant column centres the result so that it ends up with
    # zero correlation against ``by``.
    n_rows = len(by_column)
    constant_column = np.array([np.mean(series)] * n_rows).reshape(-1, 1)
    design = np.hstack((by_column, constant_column))
    coefficients = np.linalg.lstsq(design, target, rcond=None)[0]
    fitted = proportion * (design.dot(coefficients))
    residual = target - fitted
    return pd.Series(residual.ravel(), index=series.index)
def unif(df):
    """Map values to ``(rank - 0.5) / n``, preserving the index.

    Ranks use ``method="first"``, so ties are broken by order of appearance.
    """
    ordinal_ranks = df.rank(method="first")
    scaled = (ordinal_ranks - 0.5) / len(df)
    return pd.Series(scaled, index=df.index)
def get_feature_neutral_mean(df, prediction_col, target_col, features_for_neutralization=None):
    """Mean per-era correlation of feature-neutralized predictions with the target.

    The prediction column is neutralized against the given features (default:
    every column whose name starts with ``"feature"``) and stored in ``df`` as
    a ``"neutral_sub"`` column. Note that this modifies the caller's frame.
    The rank-uniformed neutralized values are then correlated with
    ``target_col`` within each ``"era"`` group and the correlations averaged.

    Args:
        df: Frame with an ``"era"`` column, the prediction, target and features.
        prediction_col: Name of the prediction column.
        target_col: Name of the target column.
        features_for_neutralization: Optional list of neutralizer columns.

    Returns:
        The mean of the per-era correlations.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [
            col for col in df.columns if col.startswith("feature")
        ]
    neutralized = neutralize(df, [prediction_col], features_for_neutralization)
    df.loc[:, "neutral_sub"] = neutralized[prediction_col]

    def _era_corr(era_df):
        return unif(era_df["neutral_sub"]).corr(era_df[target_col])

    scores = df.groupby("era").apply(_era_corr).mean()
    return np.mean(scores)
def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb, features_for_neutralization=None):
    """Feature-neutral correlation restricted to the top and bottom ``tb`` rows.

    Works on a re-indexed copy of ``df``: the prediction column is neutralized
    against the given features (default: columns starting with ``"feature"``),
    the ``tb`` smallest and ``tb`` largest neutralized values are selected, and
    the rank-uniformed neutralized values of those rows are correlated with
    ``target_col``.

    Args:
        df: Frame holding the prediction, target, feature and ``"era"`` columns.
        prediction_col: Name of the prediction column.
        target_col: Name of the target column.
        tb: Number of rows taken from each end of the ordering.
        features_for_neutralization: Optional list of neutralizer columns.

    Returns:
        The correlation over the selected rows.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [
            col for col in df.columns if col.startswith("feature")
        ]
    # A fresh 0..n-1 index is required because argsort yields positions that
    # are used as labels with .loc below.
    work = df.reset_index(drop=True).copy()
    neutralized = neutralize(work, [prediction_col], features_for_neutralization)
    work.loc[:, "neutral_sub"] = neutralized[prediction_col]
    order = work.loc[:, 'neutral_sub'].argsort()
    extremes = pd.concat([order.iloc[:tb], order.iloc[-tb:]])
    tb_rows = work.loc[extremes]
    return unif(tb_rows['neutral_sub']).corr(tb_rows[target_col])
def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
    """Per-era Pearson correlation of each prediction column with the target.

    Args:
        df: Frame containing ``era_col``, ``columns`` and ``target``.
        columns: List of prediction column names.
        target: Name of the target column.
        tb: If given, each column is scored only on its ``tb`` lowest and
            ``tb`` highest predictions within the era.
        era_col: Name of the grouping column.

    Returns:
        pandas.DataFrame indexed by era (order of first appearance) with one
        column per entry of ``columns``.
    """
    eras = df[era_col].unique()
    per_era_scores = []
    for era in eras:
        era_rows = df[df[era_col] == era]
        # One row per prediction column, one column per sample.
        preds = np.float64(era_rows[columns].values.T)
        truth = np.float64(era_rows[target].values.T)
        if tb is None:
            # Row 0 of the matrix is the target; the rest are the predictions.
            era_scores = np.corrcoef(truth, preds)[0, 1:]
        else:
            ranked = np.argsort(preds, axis=1)
            picks = np.concatenate([ranked[:, :tb], ranked[:, -tb:]], axis=1)
            tb_scores = []
            for pick, pred_row in zip(picks, preds):
                tb_scores.append(np.corrcoef(truth[pick], pred_row[pick])[0, 1])
            era_scores = np.array(tb_scores)
        per_era_scores.append(era_scores)
    return pd.DataFrame(np.array(per_era_scores), columns=columns, index=df[era_col].unique())
def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
    """Return ``1 - (u . e) / (e . e)`` for the feature-exposure vectors u and e.

    ``u`` holds the correlation of every feature column with
    ``prediction_col`` and ``e`` the correlation of every feature column with
    ``example_col``.

    Args:
        df: Frame with the feature, prediction and example columns.
        prediction_col: Name of the prediction column.
        example_col: Name of the example-prediction column.
        feature_cols: Optional list of feature columns; defaults to every
            column whose name starts with ``"feature"``.
    """
    if feature_cols is None:
        feature_cols = [col for col in df.columns if col.startswith("feature")]
    features = df.loc[:, feature_cols]
    pred_exposure = features.corrwith(df[prediction_col])
    example_exposure = features.corrwith(df[example_col])
    overlap = np.dot(pred_exposure, example_exposure)
    example_norm_sq = np.dot(example_exposure, example_exposure)
    return (1 - (overlap / example_norm_sq))
# Compute validation statistics for each prediction column.
#
# For every entry of pred_cols the loop below fills one column of
# validation_stats with rows such as "mean", "std", "sharpe", "max_drawdown",
# "apy", "mmc_mean", "corr_plus_mmc_sharpe", "corr_with_example_preds" and
# "exposure_dissimilarity_mean". Per-era values come from grouping
# validation_data on ERA_COL. The frame is transposed on return, so each
# prediction column ends up as a row and each statistic as a column.
#
# NOTE(review): indentation was lost when this text was captured (and every
# line carries a trailing "| |" artefact), so the extent of the
# `if not fast_mode:` block cannot be read from the code here. Confirm
# against the original file which of the sections after it are guarded.
def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False, | |
target_col=TARGET_COL, features_for_neutralization=None): | |
validation_stats = pd.DataFrame() | |
# Feature columns are the columns whose name starts with "feature_".
feature_cols = [c for c in validation_data if c.startswith("feature_")] | |
for pred_col in pred_cols: | |
# Check the per-era correlations on the validation set (out of sample) | |
validation_correlations = validation_data.groupby(ERA_COL).apply( | |
lambda d: unif(d[pred_col]).corr(d[target_col])) | |
mean = validation_correlations.mean() | |
std = validation_correlations.std(ddof=0) | |
sharpe = mean / std | |
validation_stats.loc["mean", pred_col] = mean | |
validation_stats.loc["std", pred_col] = std | |
validation_stats.loc["sharpe", pred_col] = sharpe | |
# Drawdown is measured on the cumulative product of (1 + per-era correlation),
# relative to its running maximum.
rolling_max = (validation_correlations + 1).cumprod().rolling(window=9000, # arbitrarily large | |
min_periods=1).max() | |
daily_value = (validation_correlations + 1).cumprod() | |
max_drawdown = -((rolling_max - daily_value) / rolling_max).max() | |
validation_stats.loc["max_drawdown", pred_col] = max_drawdown | |
# Per-era correlations are clipped to [-0.25, 0.25] before being compounded
# into the "apy" figure (a percentage).
payout_scores = validation_correlations.clip(-0.25, 0.25) | |
payout_daily_value = (payout_scores + 1).cumprod() | |
apy = ( | |
( | |
(payout_daily_value.dropna().iloc[-1]) | |
** (1 / len(payout_scores)) | |
) | |
** 49 # 52 weeks of compounding minus 3 for stake compounding lag | |
- 1 | |
) * 100 | |
validation_stats.loc["apy", pred_col] = apy | |
if not fast_mode: | |
# Check the feature exposure of your validation predictions | |
max_per_era = validation_data.groupby(ERA_COL).apply( | |
lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()) | |
max_feature_exposure = max_per_era.mean() | |
validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure | |
# Check feature neutral mean | |
feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col, | |
target_col, features_for_neutralization) | |
validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean | |
# Check TB200 feature neutral mean | |
tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(lambda df: \ | |
get_feature_neutral_mean_tb_era(df, pred_col, | |
target_col, 200, | |
features_for_neutralization)) | |
validation_stats.loc["tb200_feature_neutral_mean", pred_col] = tb200_feature_neutral_mean_era.mean() | |
# Check top and bottom 200 metrics (TB200) | |
tb200_validation_correlations = fast_score_by_date( | |
validation_data, | |
[pred_col], | |
target_col, | |
tb=200, | |
era_col=ERA_COL | |
) | |
tb200_mean = tb200_validation_correlations.mean()[pred_col] | |
tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col] | |
tb200_sharpe = tb200_mean / tb200_std | |
validation_stats.loc["tb200_mean", pred_col] = tb200_mean | |
validation_stats.loc["tb200_std", pred_col] = tb200_std | |
validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe | |
# MMC over validation | |
mmc_scores = [] | |
corr_scores = [] | |
for _, x in validation_data.groupby(ERA_COL): | |
# Per era: neutralize the rank-uniformed predictions against the example
# predictions, then take their covariance with the target.
# The 0.29 ** 2 divisor is presumably a variance normalisation; TODO confirm.
series = neutralize_series(unif(x[pred_col]), (x[example_col])) | |
mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2)) | |
corr_scores.append(unif(x[pred_col]).corr(x[target_col])) | |
val_mmc_mean = np.mean(mmc_scores) | |
# NOTE(review): val_mmc_std is assigned here but not read again in this function.
val_mmc_std = np.std(mmc_scores) | |
corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)] | |
corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs) | |
validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean | |
validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe | |
# Check correlation with example predictions | |
per_era_corrs = validation_data.groupby(ERA_COL).apply(lambda d: unif(d[pred_col]).corr(unif(d[example_col]))) | |
corr_with_example_preds = per_era_corrs.mean() | |
validation_stats.loc["corr_with_example_preds", pred_col] = corr_with_example_preds | |
#Check exposure dissimilarity per era | |
tdf = validation_data.groupby(ERA_COL).apply(lambda df: \ | |
exposure_dissimilarity_per_era(df, pred_col, | |
example_col, feature_cols)) | |
validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean() | |
# .transpose so that stats are columns and the model_name is the row | |
return validation_stats.transpose() | |