File size: 3,326 Bytes
86fa8c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
acdf71a
86fa8c7
 
 
 
 
 
acdf71a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86fa8c7
acdf71a
 
 
86fa8c7
acdf71a
 
 
 
 
 
 
 
 
86fa8c7
acdf71a
86fa8c7
acdf71a
86fa8c7
acdf71a
86fa8c7
acdf71a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86fa8c7
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
import pandas as pd
import pandas_datareader as pdr
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
import os
import datetime
from pandas.tseries.offsets import BDay
from datasets import load_dataset
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from intraCols import model_cols

def walk_forward_validation(df, target_column, num_periods, mode='full'):
    """Walk-forward validation of a LightGBM classifier on a time-indexed frame.

    Parameters
    ----------
    df : pd.DataFrame
        Time-ordered frame containing at least ``model_cols`` and the target.
    target_column : str
        Name of the boolean target column.
    num_periods : int
        Test-window size (rows) for each TimeSeriesSplit fold.
    mode : {'full', 'single'}
        'full'   -> expanding-window backtest over the whole frame; returns
                    (results DataFrame with 'True'/'Predicted'/'CalibPredicted'
                    columns, last fitted model).
        'single' -> train on all rows but the last, predict the last row;
                    returns (one-row results DataFrame, fitted model).

    Raises
    ------
    ValueError
        If ``mode`` is not 'full' or 'single'.
    """
    # Copy so we never mutate the caller's frame and avoid the
    # SettingWithCopy ambiguity of assigning into a column slice.
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(bool)

    # test_size=num_periods rows per fold, expanding train window.
    tscv = TimeSeriesSplit(n_splits=len(df) - 1, max_train_size=None, test_size=num_periods)

    if mode == 'full':
        overall_results = []
        # One fit/predict per split; train window grows, test window slides.
        for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
            X_train = df.drop(target_column, axis=1).iloc[train_index]
            y_train = df[target_column].iloc[train_index].astype(bool)
            X_test = df.drop(target_column, axis=1).iloc[test_index]
            y_test = df[target_column].iloc[test_index]

            model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
            model.fit(X_train, y_train)
            # Probability of the positive (True) class.
            predictions = model.predict_proba(X_test)[:, -1]

            result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
            overall_results.append(result_df)
        df_results = pd.concat(overall_results)

        # Calibrate probabilities: mean realized outcome per predicted-proba bin.
        def get_quantiles(frame, col_name, q):
            # Bin the predicted probabilities into q equal-width intervals and
            # return the empirical hit rate within each bin.
            return frame.groupby(pd.cut(frame[col_name], q))['True'].mean()

        greenprobas = []
        for i, pct in tqdm(enumerate(df_results['Predicted']),
                           desc='Calibrating Probas', total=len(df_results)):
            # BUG FIX: p must be reset every iteration. Previously it was only
            # assigned inside except, so a successful try with no matching bin
            # silently reused the previous row's calibrated value.
            p = None
            try:
                # Only use history strictly before row i (no look-ahead).
                df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
                for interval in df_q.index:
                    if interval.left <= pct <= interval.right:
                        p = df_q[interval]
                        break  # bins don't overlap; first match is the only match
            except Exception:
                # Best-effort: pd.cut raises on empty/degenerate history
                # (e.g. the first iterations); fall back to None.
                p = None

            greenprobas.append(p)

        df_results['CalibPredicted'] = greenprobas

        return df_results, model

    elif mode == 'single':
        # Train on everything except the final row; predict that row only.
        X_train = df.drop(target_column, axis=1).iloc[:-1]
        y_train = df[target_column].iloc[:-1].astype(bool)
        X_test = df.drop(target_column, axis=1).iloc[-1]
        y_test = df[target_column].iloc[-1]

        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
        model.fit(X_train, y_train)
        # Single row -> reshape to (1, n_features) for predict_proba.
        predictions = model.predict_proba(X_test.values.reshape(1, -1))[:, -1]
        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])

        return result_df, model

    # BUG FIX: previously an unknown mode fell through and returned None,
    # producing a confusing TypeError at the call site on unpacking.
    raise ValueError(f"mode must be 'full' or 'single', got {mode!r}")

def seq_predict_proba(df, trained_clf_model):
    """Return the positive-class probability for each row of ``df``.

    Selects the model feature columns (``model_cols``) from ``df`` and
    scores them with the already-fitted classifier.
    """
    features = df[model_cols]
    proba_matrix = trained_clf_model.predict_proba(features)
    # Last column of predict_proba is the positive (True) class.
    return proba_matrix[:, -1]