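"""Advanced Numerai example model (example_model_advanced.py).

Trains LightGBM models on several targets using embargoed time-series cross
validation, neutralizes the out-of-sample predictions to the riskiest
features, ensembles them, picks the best column by out-of-sample sharpe,
then retrains on the full training data and predicts on validation and live.
"""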
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from pathlib import Path
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    get_time_series_cross_val_splits,
    validation_metrics,
    load_model_config,
    save_model_config,
    save_prediction,
    TARGET_COL,
)
EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"
# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {"n_estimators": 2000,
                "learning_rate": 0.01,
                "max_depth": 5,
                "num_leaves": 2 ** 5,
                "colsample_bytree": 0.1}
# the amount of downsampling we'll use to speed up cross validation and full train.
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2
# if model_selection_loop=True get OOS performance for training_data
# and use that to select best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"
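# set up NumerAPI to get the current round number and download the v4 data files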
napi = NumerAPI()
current_round = napi.get_current_round()
Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/features.json")
print("Entering model selection loop. This may take awhile.")
if model_selection_loop:
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # pick some targets to use
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # randomly pick a handful of targets
    # this can be vastly improved
    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    """ do cross val to get out of sample training preds"""
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)

    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")
            # train a model on the training split (and save it for future use)
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index, [target]])
                save_model(split_model, split_model_name)
            # now we can predict on the test part of the split
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])

            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
print("creating ensembles")
# ranking per era for all of our pred cols so we can combine safely on the same scales
training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
lambda d: d[list(pred_cols)].rank(pct=True))
# do ensembles
training_data["ensemble_neutral_riskiest_50"] = sum(
[training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
training_data["ensemble_not_neutral"] = sum(
[training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")
ensemble_cols.add("ensemble_not_neutral")
ensemble_cols.add("ensemble_all")
""" Now get some stats and pick our favorite model"""
print("gathering validation metrics for out of sample training results")
all_model_cols = list(pred_cols) + list(ensemble_cols)
# use example_col preds_model_target as an estimates since no example preds provided for training
# fast_mode=True so that we skip some of the stats that are slower to calculate
training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
fast_mode=True, target_col=TARGET_COL)
print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())
# pick the model that has the highest correlation sharpe
best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
print(f"selecting model {best_pred_col} as our highest sharpe model in validation")
""" Now do a full train"""
print("entering full training section")
# getting the per era correlation of each feature vs the target across all of training data
print("getting feature correlations with target and identifying riskiest features")
all_feature_corrs = training_data.groupby(ERA_COL).apply(
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
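    # retrain one model per target on the (downsampled) full training data; these are the models used below for live predictions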
    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
    gc.collect()

    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]
""" Things that we always do even if we've already trained """
gc.collect()
print("reading tournament_data")
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validaton_predictions")
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]
# check for nans and fill nans
print("checking for nans in the tournament data")
if live_data.loc[:, feature_cols].isna().sum().sum():
    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
else:
    print("No nans in the features this week!")
pred_cols = set()
ensemble_cols = set()
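# predict with each fully trained model on validation and live data, mirroring the cross validation steps above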
for target in targets:
    gc.collect()
    model_name = f"model_{target}_downsample{downsample_full_train}"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")
    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")
    print(f"predicting tournament and validation for {model_name}")
    validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and tournament")
    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
        df=validation_data,
        columns=[f"preds_{model_name}"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL)[f"preds_{model_name}"]
    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
        df=live_data,
        columns=[f"preds_{model_name}"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL)[f"preds_{model_name}"]
    pred_cols.add(f"preds_{model_name}")
    pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")
# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
live_data["ensemble_neutral_riskiest_50"] = sum(
[live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")
validation_data["ensemble_not_neutral"] = sum(
[validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
[live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")
validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
ensemble_cols.add("ensemble_all")
gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols),
                                      example_col=EXAMPLE_PREDS_COL, fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())
# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")