import datetime
import os
from typing import List

import lightgbm as lgb
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import requests
import streamlit as st
import yfinance as yf
from bs4 import BeautifulSoup
from datasets import load_dataset
from pandas.tseries.offsets import BDay
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

from intraCols import model_cols


def _fit_classifier(X_train, y_train):
    """Fit the LightGBM classifier shared by both validation modes."""
    model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
    model.fit(X_train, y_train)
    return model


def _calibrate_probabilities(df_results, n_bins=7):
    """Map each raw prediction to the hit rate of its bin in earlier rows.

    For row ``i`` only rows ``[:i]`` are used: their 'Predicted' values are
    cut into ``n_bins`` equal-width bins, the mean of 'True' is taken per
    bin, and the current prediction is looked up in those bins.

    Returns a list with one entry per row of ``df_results``. An entry is
    ``None`` when there is no usable history or when the prediction falls
    outside every historical bin.
    """
    calibrated = []
    for i, pct in tqdm(enumerate(df_results['Predicted']),
                       desc='Calibrating Probas', total=len(df_results)):
        # Reset per row: previously a prediction that matched no bin
        # silently inherited the value computed for the preceding row.
        p = None
        history = df_results.iloc[:i]
        try:
            bins = pd.cut(history['Predicted'], n_bins)
        except ValueError:
            # pd.cut cannot bin an empty history (always the case for row 0).
            calibrated.append(p)
            continue
        # observed=False keeps empty bins in the result (their mean is NaN),
        # independent of the pandas version's default.
        hit_rates = history.groupby(bins, observed=False)['True'].mean()
        for interval, rate in hit_rates.items():
            # Inclusive on both edges; on a shared edge the later bin wins.
            if interval.left <= pct <= interval.right:
                p = rate
        calibrated.append(p)
    return calibrated


def walk_forward_validation(df, target_column, num_periods, mode='full'):
    """Train and evaluate a LightGBM classifier in walk-forward fashion.

    Only the columns in ``model_cols`` plus ``target_column`` are used; the
    target is cast to ``bool``. The caller's frame is not modified.

    Args:
        df: DataFrame containing ``model_cols`` and ``target_column``,
            in the order the rows should be walked.
        target_column: Name of the label column.
        num_periods: Number of rows in each test fold ('full' mode only).
        mode: ``'full'`` refits on an expanding window for every fold and
            adds a 'CalibPredicted' column; ``'single'`` fits once on all
            rows but the last and predicts the last row.

    Returns:
        ``(results, model)`` where ``results`` has columns 'True' and
        'Predicted' (plus 'CalibPredicted' in 'full' mode) and ``model`` is
        the last classifier that was fitted.

    Raises:
        ValueError: If ``mode`` is not 'full' or 'single', if
            ``num_periods`` is smaller than 1, or if ``df`` has too few
            rows to build at least two folds in 'full' mode.
    """
    if mode not in ('full', 'single'):
        raise ValueError(f"mode must be 'full' or 'single', got {mode!r}")

    # Copy so the bool cast below never writes into the caller's frame.
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    if mode == 'single':
        model = _fit_classifier(features.iloc[:-1], target.iloc[:-1])
        # Keep the last row as a one-row DataFrame so the model sees the
        # same feature names it was trained with.
        predictions = model.predict_proba(features.iloc[[-1]])[:, -1]
        result_df = pd.DataFrame(
            {'True': target.iloc[-1], 'Predicted': predictions},
            index=[df.index[-1]],
        )
        return result_df, model

    if num_periods < 1:
        raise ValueError(f"num_periods must be >= 1, got {num_periods}")
    # As many folds of num_periods rows as fit while leaving at least one
    # training row. Equals len(df) - 1 when num_periods == 1.
    n_splits = (len(df) - 1) // num_periods
    if n_splits < 2:
        raise ValueError(
            f"need at least {2 * num_periods + 1} rows for num_periods="
            f"{num_periods}, got {len(df)}"
        )
    tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=None,
                           test_size=num_periods)

    overall_results = []
    for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
        model = _fit_classifier(features.iloc[train_index],
                                target.iloc[train_index])
        y_test = target.iloc[test_index]
        # Probability of the last class reported by the classifier.
        predictions = model.predict_proba(features.iloc[test_index])[:, -1]
        overall_results.append(
            pd.DataFrame({'True': y_test, 'Predicted': predictions},
                         index=y_test.index)
        )

    df_results = pd.concat(overall_results)
    df_results['CalibPredicted'] = _calibrate_probabilities(df_results, 7)
    return df_results, model


def seq_predict_proba(df, trained_clf_model):
    """Return the last-class probability for every row of ``df``.

    Args:
        df: DataFrame containing at least the ``model_cols`` columns.
        trained_clf_model: Fitted classifier exposing ``predict_proba``.

    Returns:
        The last column of ``predict_proba``'s output, one value per row.
    """
    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:, -1]
    return clf_pred_proba