gamedayspx / model_intra_v2.py
boomsss's picture
cleapup
acdf71a
raw
history blame contribute delete
No virus
3.33 kB
import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
import os
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from intraCols import model_cols
def _calibrate_probabilities(df_results, n_bins=7):
    """Map each raw probability to the historical hit rate of its probability bin.

    For row ``i`` only rows ``0..i-1`` are used, so the calibration never looks
    at the row being calibrated or anything after it.

    Args:
        df_results: DataFrame with a 'Predicted' column (raw probabilities)
            and a 'True' column (observed outcomes), in chronological order.
        n_bins: Number of equal-width bins passed to ``pd.cut``.

    Returns:
        A list with one entry per row of ``df_results``: the mean of 'True'
        within the matching bin, or ``None`` when no calibration is available
        (no history yet, the history cannot be binned, or the prediction lies
        outside every historical bin).
    """
    calibrated = []
    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
        # Reset on every row; otherwise a prediction that matches no bin would
        # silently inherit the previous row's calibrated value.
        p = None
        history = df_results.iloc[:i]
        if not history.empty:
            try:
                bins = pd.cut(history['Predicted'], n_bins)
            except ValueError:
                # History that cannot be binned: leave this row uncalibrated.
                bins = None
            if bins is not None:
                # observed=False keeps empty bins in the result (their mean is NaN).
                hit_rates = history.groupby(bins, observed=False)['True'].mean()
                for interval, rate in hit_rates.items():
                    # Both ends are treated as inclusive, so a value sitting on
                    # a shared edge takes the later (higher) bin.
                    if interval.left <= pct <= interval.right:
                        p = rate
        calibrated.append(p)
    return calibrated


def walk_forward_validation(df, target_column, num_periods, mode='full'):
    """Fit LightGBM classifiers walk-forward and collect out-of-sample probabilities.

    Only the columns listed in ``model_cols`` are used as features. The target
    column is cast to bool before training.

    Args:
        df: DataFrame holding every column in ``model_cols`` plus
            ``target_column``, ordered chronologically.
        target_column: Name of the column to predict.
        num_periods: Number of rows in each test fold (used by 'full' mode only).
        mode: 'full' refits on an expanding window for every fold and adds a
            calibrated probability; 'single' trains on all rows but the last
            and predicts only the last row.

    Returns:
        A ``(results, model)`` tuple. ``results`` has columns 'True' and
        'Predicted' (the last column of ``predict_proba``); in 'full' mode it
        also has 'CalibPredicted'. ``model`` is the last classifier fitted.

    Raises:
        ValueError: If ``mode`` is not 'full' or 'single', if ``num_periods``
            is smaller than 1 in 'full' mode, or (raised by scikit-learn) if
            there are too few rows to form at least two folds.
    """
    if mode not in ('full', 'single'):
        raise ValueError(f"mode must be 'full' or 'single', got {mode!r}")

    # Explicit copy so the dtype cast below does not trigger pandas'
    # SettingWithCopyWarning on the column-sliced frame.
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)

    # Split features and target once instead of re-dropping the column per fold.
    features = df.drop(columns=target_column)
    target = df[target_column]

    if mode == 'single':
        X_train, y_train = features.iloc[:-1], target.iloc[:-1]
        # Keep the last row as a one-row DataFrame so column names and dtypes
        # reach the model unchanged.
        X_test = features.iloc[[-1]]
        y_test = target.iloc[-1]
        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X_train, y_train)
        predictions = model.predict_proba(X_test)[:, -1]
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
        return result_df, model

    if num_periods < 1:
        raise ValueError(f"num_periods must be at least 1, got {num_periods!r}")

    # Use as many folds of `num_periods` rows as fit while leaving at least one
    # training row. For num_periods == 1 this equals len(df) - 1.
    n_splits = (len(df) - 1) // num_periods
    tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=None, test_size=num_periods)

    overall_results = []
    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_test, y_test = features.iloc[test_index], target.iloc[test_index]

        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X_train, y_train)

        predictions = model.predict_proba(X_test)[:, -1]
        overall_results.append(
            pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
        )

    df_results = pd.concat(overall_results)
    df_results['CalibPredicted'] = _calibrate_probabilities(df_results)
    return df_results, model
def seq_predict_proba(df, trained_clf_model):
    """Score ``df`` with an already-fitted classifier.

    Args:
        df: DataFrame containing every column listed in ``model_cols``.
        trained_clf_model: Fitted classifier exposing ``predict_proba``.

    Returns:
        The last column of ``predict_proba`` evaluated on the ``model_cols``
        features, one value per row of ``df``.
    """
    feature_frame = df[model_cols]
    class_probabilities = trained_clf_model.predict_proba(feature_frame)
    return class_probabilities[:, -1]