# numerai-example-script / example_model_advanced.py
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from pathlib import Path
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    get_time_series_cross_val_splits,
    validation_metrics,
    load_model_config,
    save_model_config,
    save_prediction,
    TARGET_COL,
)
EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"
# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {"n_estimators": 2000,
"learning_rate": 0.01,
"max_depth": 5,
"num_leaves": 2 ** 5,
"colsample_bytree": 0.1}
# the amount of downsampling we'll use to speed up cross validation and full train.
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2
# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"
napi = NumerAPI()
current_round = napi.get_current_round()
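# make sure the local ./v4 data directory exists before downloading the training data and feature metadata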
Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/features.json")
print("Entering model selection loop. This may take awhile.")
if model_selection_loop:
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')
    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()
    # pick some targets to use
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # hand-pick a handful of the available targets to train against
    # this selection can be vastly improved
    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]
    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    """ do cross val to get out of sample training preds"""
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
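        # build boolean masks for the rows whose eras fall in this split's train/test eras,
        # then take every Nth training row (downsample_cross_val) to keep this loop fast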
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")
            # train a model on the training split (and save it for future use)
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index, [target]])
                save_model(split_model, split_model_name)
            # now we can predict on the test part of the split
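            # sanity check: the saved model's feature names should match the current feature set;
            # if the dataset has gained features since training, flag that the model may be stale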
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])
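            # neutralize() (from utils) fits, per era, a linear model of the prediction column on the
            # neutralizer features and subtracts the fitted component (scaled by proportion), so the
            # result is roughly uncorrelated with the riskiest features within each era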
            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
print("creating ensembles")
# ranking per era for all of our pred cols so we can combine safely on the same scales
training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
lambda d: d[list(pred_cols)].rank(pct=True))
# do ensembles
training_data["ensemble_neutral_riskiest_50"] = sum(
[training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
training_data["ensemble_not_neutral"] = sum(
[training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")
ensemble_cols.add("ensemble_not_neutral")
ensemble_cols.add("ensemble_all")
""" Now get some stats and pick our favorite model"""
print("gathering validation metrics for out of sample training results")
all_model_cols = list(pred_cols) + list(ensemble_cols)
# use example_col preds_model_target as an estimates since no example preds provided for training
# fast_mode=True so that we skip some of the stats that are slower to calculate
training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
fast_mode=True, target_col=TARGET_COL)
print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())
# pick the model that has the highest correlation sharpe
best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
print(f"selecting model {best_pred_col} as our highest sharpe model in validation")
""" Now do a full train"""
print("entering full training section")
# getting the per era correlation of each feature vs the target across all of training data
print("getting feature correlations with target and identifying riskiest features")
all_feature_corrs = training_data.groupby(ERA_COL).apply(
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
        gc.collect()
model_config["feature_cols"] = feature_cols
model_config["targets"] = targets
model_config["best_pred_col"] = best_pred_col
model_config["riskiest_features"] = riskiest_features
print(f"saving model config for {model_config_name}")
save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]
""" Things that we always do even if we've already trained """
gc.collect()
print("reading tournament_data")
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validaton_predictions")
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]
# check for nans and fill nans
print("checking for nans in the tournament data")
if live_data.loc[:, feature_cols].isna().sum().sum():
    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
else:
    print("No nans in the features this week!")
pred_cols = set()
ensemble_cols = set()
for target in targets:
    gc.collect()
    model_name = f"model_{target}_downsample{downsample_full_train}"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")

    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")
    print(f"predicting tournament and validation for {model_name}")
    validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and tournament")
    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                            columns=[f"preds_{model_name}"],
                                                                            neutralizers=riskiest_features,
                                                                            proportion=1.0,
                                                                            normalize=True,
                                                                            era_col=ERA_COL)[f"preds_{model_name}"]
    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
                                                                      columns=[f"preds_{model_name}"],
                                                                      neutralizers=riskiest_features,
                                                                      proportion=1.0,
                                                                      normalize=True,
                                                                      era_col=ERA_COL)[f"preds_{model_name}"]
    pred_cols.add(f"preds_{model_name}")
    pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
live_data["ensemble_neutral_riskiest_50"] = sum(
[live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")
validation_data["ensemble_not_neutral"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
[live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")
validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
ensemble_cols.add("ensemble_all")
gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols),
                                      example_col=EXAMPLE_PREDS_COL, fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())
# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")