hellno-o committed
Commit 2d90381
Parent: bd087ef

Add example script and requirements

Files changed (3)
  1. app.py +170 -0
  2. example-scripts +1 -0
  3. requirements.txt +100 -0
app.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+
+ st.title('Numerai Example Script')
+
+
+ # content below from
+ # https://github.com/numerai/example-scripts/blob/master/example_model.py
+ #
+
+ import pandas as pd
+ from lightgbm import LGBMRegressor
+ import gc
+ import json
+ from pathlib import Path
+
+ from numerapi import NumerAPI
+ from utils import (
+     save_model,
+     load_model,
+     neutralize,
+     get_biggest_change_features,
+     validation_metrics,
+     ERA_COL,
+     DATA_TYPE_COL,
+     TARGET_COL,
+     EXAMPLE_PREDS_COL
+ )
+
+
+ # download all the things
+
+ napi = NumerAPI()
+
+ current_round = napi.get_current_round()
+
+ # Tournament data changes every week so we specify the round in their name. Training
+ # and validation data only change periodically, so no need to download them every time.
+ print('Downloading dataset files...')
+
+ Path("./v4").mkdir(parents=False, exist_ok=True)
+ napi.download_dataset("v4/train.parquet")
+ napi.download_dataset("v4/validation.parquet")
+ napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
+ napi.download_dataset("v4/validation_example_preds.parquet")
+ napi.download_dataset("v4/features.json")
+
+ print('Reading minimal training data')
+ # read the feature metadata and get a feature set (or all the features)
+ with open("v4/features.json", "r") as f:
+     feature_metadata = json.load(f)
+ # features = list(feature_metadata["feature_stats"].keys()) # get all the features
+ # features = feature_metadata["feature_sets"]["small"] # get the small feature set
+ features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
+ # read in just those features along with era and target columns
+ read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
+
+ # note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
+ # if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
+ training_data = pd.read_parquet('v4/train.parquet',
+                                 columns=read_columns)
+ validation_data = pd.read_parquet('v4/validation.parquet',
+                                   columns=read_columns)
+ live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
+                             columns=read_columns)
+
+
+ # pare down the number of eras to every 4th era
+ # every_4th_era = training_data[ERA_COL].unique()[::4]
+ # training_data = training_data[training_data[ERA_COL].isin(every_4th_era)]
+
+ # getting the per era correlation of each feature vs the target
+ all_feature_corrs = training_data.groupby(ERA_COL).apply(
+     lambda era: era[features].corrwith(era[TARGET_COL])
+ )
+
+ # find the riskiest features by comparing their correlation vs
+ # the target in each half of training data; we'll use these later
+ riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
+
+ # "garbage collection" (gc) gets rid of unused data and frees up memory
+ gc.collect()
+
+ model_name = f"model_target"
+ print(f"Checking for existing model '{model_name}'")
+ model = load_model(model_name)
+ if not model:
+     print(f"model not found, creating new one")
+     params = {"n_estimators": 2000,
+               "learning_rate": 0.01,
+               "max_depth": 5,
+               "num_leaves": 2 ** 5,
+               "colsample_bytree": 0.1}
+
+     model = LGBMRegressor(**params)
+
+     # train on all of train and save the model so we don't have to train next time
+     model.fit(training_data.filter(like='feature_', axis='columns'),
+               training_data[TARGET_COL])
+     print(f"saving new model: {model_name}")
+     save_model(model, model_name)
+
+ gc.collect()
+
+ nans_per_col = live_data[live_data["data_type"] == "live"][features].isna().sum()
+
+ # check for nans and fill nans
+ if nans_per_col.any():
+     total_rows = len(live_data[live_data["data_type"] == "live"])
+     print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
+     print(f"out of {total_rows} total rows")
+     print(f"filling nans with 0.5")
+     live_data.loc[:, features] = live_data.loc[:, features].fillna(0.5)
+
+ else:
+     print("No nans in the features this week!")
+
+
+ # double check the feature that the model expects vs what is available to prevent our
+ # pipeline from failing if Numerai adds more data and we don't have time to retrain!
+ model_expected_features = model.booster_.feature_name()
+ if set(model_expected_features) != set(features):
+     print(f"New features are available! Might want to retrain model {model_name}.")
+ validation_data.loc[:, f"preds_{model_name}"] = model.predict(
+     validation_data.loc[:, model_expected_features])
+ live_data.loc[:, f"preds_{model_name}"] = model.predict(
+     live_data.loc[:, model_expected_features])
+
+ gc.collect()
+
+ # neutralize our predictions to the riskiest features
+ validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+     df=validation_data,
+     columns=[f"preds_{model_name}"],
+     neutralizers=riskiest_features,
+     proportion=1.0,
+     normalize=True,
+     era_col=ERA_COL
+ )
+
+ live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
+     df=live_data,
+     columns=[f"preds_{model_name}"],
+     neutralizers=riskiest_features,
+     proportion=1.0,
+     normalize=True,
+     era_col=ERA_COL
+ )
+
+ model_to_submit = f"preds_{model_name}_neutral_riskiest_50"
+
+ # rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
+ validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
+ live_data["prediction"] = live_data[model_to_submit].rank(pct=True)
+ validation_data["prediction"].to_csv(f"validation_predictions_{current_round}.csv")
+ live_data["prediction"].to_csv(f"live_predictions_{current_round}.csv")
+
+ validation_preds = pd.read_parquet('v4/validation_example_preds.parquet')
+ validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
+
+ # get some stats about each of our models to compare...
+ # fast_mode=True so that we skip some of the stats that are slower to calculate
+ validation_stats = validation_metrics(validation_data, [model_to_submit, f"preds_{model_name}"], example_col=EXAMPLE_PREDS_COL, fast_mode=True, target_col=TARGET_COL)
+ print(validation_stats[["mean", "sharpe"]].to_markdown())
+
+ print(f'''
+ Done! Next steps:
+ 1. Go to numer.ai/tournament (make sure you have an account)
+ 2. Submit validation_predictions_{current_round}.csv to the diagnostics tool
+ 3. Submit live_predictions_{current_round}.csv to the "Upload Predictions" button
+ ''')
example-scripts ADDED
@@ -0,0 +1 @@
+ Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
requirements.txt ADDED
@@ -0,0 +1,100 @@
+ #
+ # This file is autogenerated by pip-compile with python 3.10
+ # To update, run:
+ #
+ #    pip-compile
+ #
+ certifi==2021.10.8
+     # via requests
+ charset-normalizer==2.0.12
+     # via requests
+ click==8.0.4
+     # via numerapi
+ colorama==0.4.4
+     # via
+     #   halo
+     #   log-symbols
+ cycler==0.11.0
+     # via matplotlib
+ fonttools==4.31.1
+     # via matplotlib
+ halo==0.0.31
+     # via -r requirements.in
+ idna==3.3
+     # via requests
+ joblib==1.1.0
+     # via scikit-learn
+ kiwisolver==1.4.0
+     # via matplotlib
+ lightgbm==3.3.2
+     # via -r requirements.in
+ log-symbols==0.0.14
+     # via halo
+ matplotlib==3.5.1
+     # via -r requirements.in
+ numerapi==2.9.4
+     # via -r requirements.in
+ numpy==1.22.3
+     # via
+     #   -r requirements.in
+     #   lightgbm
+     #   matplotlib
+     #   pandas
+     #   pyarrow
+     #   scikit-learn
+     #   scipy
+ packaging==21.3
+     # via matplotlib
+ pandas==1.4.1
+     # via
+     #   -r requirements.in
+     #   numerapi
+ pillow==9.0.1
+     # via matplotlib
+ pyarrow==7.0.0
+     # via -r requirements.in
+ pyparsing==3.0.7
+     # via
+     #   matplotlib
+     #   packaging
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   numerapi
+     #   pandas
+ pytz==2022.1
+     # via
+     #   numerapi
+     #   pandas
+ requests==2.27.1
+     # via
+     #   -r requirements.in
+     #   numerapi
+ scikit-learn==1.0.2
+     # via
+     #   -r requirements.in
+     #   lightgbm
+ scipy==1.8.0
+     # via
+     #   -r requirements.in
+     #   lightgbm
+     #   scikit-learn
+ six==1.16.0
+     # via
+     #   -r requirements.in
+     #   halo
+     #   python-dateutil
+ spinners==0.0.24
+     # via halo
+ tabulate==0.8.9
+     # via -r requirements.in
+ termcolor==1.1.0
+     # via halo
+ threadpoolctl==3.1.0
+     # via scikit-learn
+ tqdm==4.63.0
+     # via numerapi
+ urllib3==1.26.9
+     # via requests
+ wheel==0.37.1
+     # via lightgbm