"""Utilities for saving/loading models and scoring per-era predictions."""
import json
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
from scipy.stats import skew

try:
    from halo import Halo
except ImportError:
    # halo only drives the cosmetic console spinner defined below; nothing
    # else in this module needs it, so its absence must not break the import.
    Halo = None

ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

MODEL_FOLDER = "models"
MODEL_CONFIGS_FOLDER = "model_configs"
PREDICTION_FILES_FOLDER = "prediction_files"


class _NullSpinner:
    """Stand-in used when halo is unavailable: accepts any call, does nothing."""

    def __getattr__(self, name):
        if name.startswith("__"):
            raise AttributeError(name)
        return lambda *args, **kwargs: self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return False


spinner = Halo(text='', spinner='dots') if Halo is not None else _NullSpinner()


def _ensure_folder(folder):
    """Create *folder* (and any parents) if it does not exist yet."""
    try:
        Path(folder).mkdir(exist_ok=True, parents=True)
    except OSError:
        # Best effort on purpose: if the folder is genuinely unusable, the
        # write performed right after this call raises with the exact path.
        pass


def _per_era(data, func, era_col=ERA_COL):
    """Apply *func* to each era's rows; return the results as a Series by era.

    Iterating the groups (instead of ``groupby(...).apply``) always hands
    *func* the complete sub-frame, including the era column itself.
    """
    return pd.Series(
        {era: func(era_df) for era, era_df in data.groupby(era_col)},
        dtype=float,
    )


def save_prediction(df, name):
    """Write *df* (with its index) to ``PREDICTION_FILES_FOLDER/<name>.csv``."""
    _ensure_folder(PREDICTION_FILES_FOLDER)
    df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)


def save_model(model, name):
    """Pickle *model* to ``MODEL_FOLDER/<name>.pkl``."""
    _ensure_folder(MODEL_FOLDER)
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")


def load_model(name):
    """Return the model pickled as *name*, or ``False`` if no such file exists."""
    path = Path(f"{MODEL_FOLDER}/{name}.pkl")
    if not path.is_file():
        return False
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    return pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl")


def save_model_config(model_config, model_name):
    """Dump *model_config* as JSON to ``MODEL_CONFIGS_FOLDER/<model_name>.json``."""
    _ensure_folder(MODEL_CONFIGS_FOLDER)
    with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", 'w') as fp:
        json.dump(model_config, fp)


def load_model_config(model_name):
    """Return the stored JSON config for *model_name*, or ``False`` if missing."""
    path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
    if not Path(path_str).is_file():
        return False
    with open(path_str, 'r') as fp:
        return json.load(fp)


def get_biggest_change_features(corrs, n):
    """Return the *n* columns whose mean changes most between era halves.

    *corrs* is indexed by era; its sorted index is split in two halves and the
    columns are ranked by the absolute difference of the per-half means.
    """
    all_eras = corrs.index.sort_values()
    half = len(all_eras) // 2
    h1_corr_means = corrs.loc[all_eras[:half], :].mean()
    h2_corr_means = corrs.loc[all_eras[half:], :].mean()
    corr_diffs = h2_corr_means - h1_corr_means
    return corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()


def get_time_series_cross_val_splits(data, cv=3, embargo=12):
    """Build embargoed (train_eras, test_eras) splits over ``data[ERA_COL]``.

    Returns a ``zip`` object (single-pass iterator) of *cv* pairs.  Era labels
    must be convertible with ``int``.
    """
    all_train_eras = data[ERA_COL].unique()
    len_split = len(all_train_eras) // cv
    test_splits = [all_train_eras[i * len_split:(i + 1) * len_split]
                   for i in range(cv)]
    # fix the last test split to have all the last eras, in case the number
    # of eras wasn't divisible by cv
    remainder = len(all_train_eras) % cv
    if remainder != 0:
        test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the eras that aren't in the test split
        train_split_not_embargoed = [
            e for e in all_train_eras
            if not (test_split_min <= int(e) <= test_split_max)
        ]
        # embargo the train split so we have no leakage.
        # one era is length 5, so we need to embargo by target_length/5 eras.
        # To be consistent for all targets, let's embargo everything by
        # 60/5 == 12 eras.
        train_split = [
            e for e in train_split_not_embargoed
            if abs(int(e) - test_split_max) > embargo
            and abs(int(e) - test_split_min) > embargo
        ]
        train_splits.append(train_split)

    # convenient way to iterate over train and test splits
    return zip(train_splits, test_splits)


def neutralize(df, columns, neutralizers=None, proportion=1.0, normalize=True,
               era_col="era"):
    """Neutralize *columns* against *neutralizers* separately within each era.

    With ``normalize`` the columns are first rank-gaussianized per era.  The
    projection onto the neutralizers (scaled by *proportion*) is subtracted
    and the result is divided by its overall standard deviation for that era.

    Returns a DataFrame with *columns*, indexed like *df*; each output row
    corresponds to the input row at the same position, whatever the era order.
    """
    if neutralizers is None:
        neutralizers = []
    era_values = df[era_col]
    row_positions = []
    computed = []
    for era in era_values.unique():
        in_era = (era_values == era).to_numpy()
        df_era = df[in_era]
        scores = df_era[columns].values
        if normalize:
            scores = np.array([
                scipy.stats.norm.ppf(
                    (scipy.stats.rankdata(col, method='ordinal') - .5) / len(col))
                for col in scores.T
            ]).T
        exposures = df_era[neutralizers].values
        # Out-of-place arithmetic: ``.values`` may be a read-only view.
        scores = scores - proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(
                scores.astype(np.float32)))
        scores = scores / scores.std(ddof=0)
        row_positions.append(np.flatnonzero(in_era))
        computed.append(scores)

    stacked = np.concatenate(computed)
    # Scatter each era's rows back to where they came from, so the result
    # lines up with df.index even when eras are interleaved.
    neutralized = np.full((len(df),) + stacked.shape[1:], np.nan,
                          dtype=stacked.dtype)
    neutralized[np.concatenate(row_positions)] = stacked
    return pd.DataFrame(neutralized, columns=columns, index=df.index)


def neutralize_series(series, by, proportion=1.0):
    """Remove from *series* its least-squares fit on *by* plus an intercept.

    Returns a Series indexed like *series*.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)
    # Add a constant column so the result is centered and has zero
    # correlation with the exposures.  A column of ones is used (rather than
    # the series mean) so the intercept still exists when that mean is 0.
    exposures = np.hstack((exposures, np.ones((len(exposures), 1))))
    correction = proportion * exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0])
    corrected_scores = scores - correction
    return pd.Series(corrected_scores.ravel(), index=series.index)


def unif(df):
    """Map values to (rank - 0.5) / n, breaking ties by order of appearance."""
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)


def get_feature_neutral_mean(df, prediction_col, target_col,
                             features_for_neutralization=None):
    """Mean per-era correlation of the feature-neutralized prediction and target.

    Defaults to neutralizing against every column starting with "feature".
    *df* is left unmodified.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns
                                       if c.startswith("feature")]
    neutral = neutralize(df, [prediction_col],
                         features_for_neutralization)[prediction_col]
    # Work on a small scratch frame instead of writing a column into df.
    scratch = pd.DataFrame({
        "era": df[ERA_COL].to_numpy(),
        "neutral_sub": neutral.to_numpy(),
        "target": df[target_col].to_numpy(),
    })
    per_era = _per_era(
        scratch, lambda d: unif(d["neutral_sub"]).corr(d["target"]),
        era_col="era")
    return per_era.mean()


def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb,
                                    features_for_neutralization=None):
    """Correlation with the target over the top/bottom *tb* neutralized rows.

    *df* is expected to hold one era (it is neutralized via ``neutralize``,
    which groups by the "era" column).  *df* is left unmodified.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns
                                       if c.startswith("feature")]
    # Reset index due to use of argsort later
    temp_df = df.reset_index(drop=True).copy()
    temp_df.loc[:, "neutral_sub"] = neutralize(
        temp_df, [prediction_col], features_for_neutralization)[prediction_col]
    order = temp_df.loc[:, 'neutral_sub'].argsort()
    tb_positions = pd.concat([order.iloc[:tb], order.iloc[-tb:]]).values
    # Positions double as labels because the index was reset above.
    temp_df_tb = temp_df.loc[tb_positions]
    return unif(temp_df_tb['neutral_sub']).corr(temp_df_tb[target_col])


def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
    """Per-era Pearson correlation of each of *columns* with *target*.

    With *tb* set, each column is scored only on its *tb* lowest and *tb*
    highest predictions of the era.  Returns a DataFrame indexed by era (in
    order of first appearance) with one column per entry of *columns*.
    """
    unique_eras = df[era_col].unique()
    computed = []
    for era in unique_eras:
        df_era = df[df[era_col] == era]
        era_pred = np.asarray(df_era[columns].values.T, dtype=np.float64)
        era_target = np.asarray(df_era[target].values.T, dtype=np.float64)
        if tb is None:
            ccs = np.corrcoef(era_target, era_pred)[0, 1:]
        else:
            tbidx = np.argsort(era_pred, axis=1)
            tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
            ccs = np.array([
                np.corrcoef(era_target[tmpidx], tmppred[tmpidx])[0, 1]
                for tmpidx, tmppred in zip(tbidx, era_pred)
            ])
        computed.append(ccs)
    return pd.DataFrame(np.array(computed), columns=columns, index=unique_eras)


def exposure_dissimilarity_per_era(df, prediction_col, example_col,
                                   feature_cols=None):
    """Return 1 - (u . e) / (e . e) for feature exposures u (prediction), e (example).

    Exposures are the correlations of each feature column with the given
    column; features default to the columns starting with "feature".
    """
    if feature_cols is None:
        feature_cols = [c for c in df.columns if c.startswith("feature")]
    u = df.loc[:, feature_cols].corrwith(df[prediction_col])
    e = df.loc[:, feature_cols].corrwith(df[example_col])
    return 1 - (np.dot(u, e) / np.dot(e, e))


def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False,
                       target_col=TARGET_COL, features_for_neutralization=None):
    """Compute validation statistics for each prediction column.

    Always computed: mean, std, sharpe, max_drawdown, apy.  Unless *fast_mode*
    is set, feature exposure, feature-neutral mean, TB200 metrics, MMC,
    correlation with the example predictions and exposure dissimilarity are
    added.  Returns a DataFrame with one row per prediction column and one
    column per statistic.  *validation_data* is left unmodified.
    """
    validation_stats = pd.DataFrame()
    feature_cols = [c for c in validation_data if c.startswith("feature_")]
    for pred_col in pred_cols:
        # Check the per-era correlations on the validation set (out of sample)
        validation_correlations = _per_era(
            validation_data, lambda d: unif(d[pred_col]).corr(d[target_col]))

        mean = validation_correlations.mean()
        std = validation_correlations.std(ddof=0)
        sharpe = mean / std

        validation_stats.loc["mean", pred_col] = mean
        validation_stats.loc["std", pred_col] = std
        validation_stats.loc["sharpe", pred_col] = sharpe

        daily_value = (validation_correlations + 1).cumprod()
        rolling_max = daily_value.rolling(window=9000,  # arbitrarily large
                                          min_periods=1).max()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        validation_stats.loc["max_drawdown", pred_col] = max_drawdown

        payout_scores = validation_correlations.clip(-0.25, 0.25)
        payout_daily_value = (payout_scores + 1).cumprod()

        apy = (
            (
                (payout_daily_value.dropna().iloc[-1])
                ** (1 / len(payout_scores))
            )
            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
            - 1
        ) * 100

        validation_stats.loc["apy", pred_col] = apy

        if fast_mode:
            continue

        # Check the feature exposure of your validation predictions
        max_per_era = _per_era(
            validation_data,
            lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max())
        validation_stats.loc["max_feature_exposure", pred_col] = max_per_era.mean()

        # Check feature neutral mean
        feature_neutral_mean = get_feature_neutral_mean(
            validation_data, pred_col, target_col, features_for_neutralization)
        validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

        # Check TB200 feature neutral mean.  The callback needs the era
        # column, so each era's full frame is passed explicitly.
        tb200_feature_neutral_mean_era = _per_era(
            validation_data,
            lambda era_df: get_feature_neutral_mean_tb_era(
                era_df, pred_col, target_col, 200, features_for_neutralization))
        validation_stats.loc["tb200_feature_neutral_mean", pred_col] = \
            tb200_feature_neutral_mean_era.mean()

        # Check top and bottom 200 metrics (TB200)
        tb200_validation_correlations = fast_score_by_date(
            validation_data,
            [pred_col],
            target_col,
            tb=200,
            era_col=ERA_COL
        )

        tb200_mean = tb200_validation_correlations.mean()[pred_col]
        tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
        tb200_sharpe = tb200_mean / tb200_std

        validation_stats.loc["tb200_mean", pred_col] = tb200_mean
        validation_stats.loc["tb200_std", pred_col] = tb200_std
        validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe

        # MMC over validation
        mmc_scores = []
        corr_scores = []
        for _, x in validation_data.groupby(ERA_COL):
            series = neutralize_series(unif(x[pred_col]), (x[example_col]))
            mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
            corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

        val_mmc_mean = np.mean(mmc_scores)
        corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
        corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)

        validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
        validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe

        # Check correlation with example predictions
        per_era_corrs = _per_era(
            validation_data,
            lambda d: unif(d[pred_col]).corr(unif(d[example_col])))
        validation_stats.loc["corr_with_example_preds", pred_col] = per_era_corrs.mean()

        # Check exposure dissimilarity per era
        tdf = _per_era(
            validation_data,
            lambda era_df: exposure_dissimilarity_per_era(
                era_df, pred_col, example_col, feature_cols))
        validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()

    # .transpose so that stats are columns and the model_name is the row
    return validation_stats.transpose()