Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	File size: 3,212 Bytes
			
			| 8cf4695 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | import math
import pandas as pd
from sktime.forecasting.base import ForecastingHorizon
def split_x_y(
    data: pd.DataFrame,
    window_length: int,
    n_predict: int,
    freq: str,
):
    '''
    Split a frame into target, lagged feature matrices and forecasting horizon.

    ``data`` must contain a ``y`` column; every other column is treated as an
    additional feature. When feature columns exist, ``window_length`` shifted
    copies of them are built (shift 0 .. window_length - 1). Each shifted
    column is named ``<col>_-<n_predict + shift>`` because row ``i`` of the
    feature matrix is paired with the target ``n_predict`` rows later.

    Parameters
    ----------
    data : frame indexed by the time axis, holding ``y`` and optional features.
    window_length : number of shifted copies of the features to build.
        Only used when feature columns are present.
    n_predict : number of steps to forecast; also the number of trailing
        feature rows reserved for forecasting.
    freq : frequency string forwarded to ``ForecastingHorizon``.

    Returns
    -------
    tuple ``(fh, y, X_train, X_forecast)``
        ``fh`` is the absolute forecasting horizon covering steps
        ``1 .. n_predict`` after the last index value that is kept.
        ``y`` is the target series (its first ``n_predict`` rows are dropped
        when features are present). ``X_train`` / ``X_forecast`` are the
        lagged features indexed like ``y`` / ``fh``, or ``None`` when ``data``
        has no feature columns.

    Raises
    ------
    ValueError
        If feature columns are present and ``window_length`` is below 1.
    '''
    datetime_index = data.index
    y = data['y']
    X_train, X_forecast = None, None

    if len(data.columns) > 1:
        if window_length < 1:
            raise ValueError(
                'window_length must be at least 1 to build lagged features, '
                f'got {window_length}')

        # Positional index so that every shifted copy aligns row-by-row.
        X = data.drop(columns='y').reset_index(drop=True)

        # Build all lags first and concatenate once; concatenating inside the
        # loop would re-copy every previously built column on each iteration.
        lagged_frames = [
            X.shift(n).rename(
                columns={col: f'{col}_-{n_predict + n}' for col in X.columns})
            for n in range(window_length)
        ]
        # Shifting leaves NaN in the leading rows; back-fill them.
        X_lagged = pd.concat(lagged_frames, axis=1).bfill()

        # The last n_predict rows have no matching target yet: they become
        # the features used for forecasting.
        X_forecast = X_lagged.iloc[-n_predict:]
        X_train = X_lagged.iloc[:-n_predict]

        # Drop the first n_predict targets / timestamps so that feature row i
        # lines up with the target observed n_predict steps later.
        y = y.iloc[n_predict:]
        datetime_index = datetime_index[n_predict:]
        X_train = X_train.set_index(datetime_index)

    fh = ForecastingHorizon(
        list(range(1, n_predict + 1)), is_relative=True, freq=freq)
    # Cutoff is the last datetime value in the given data,
    # meaning we forecast right after this point in time.
    cutoff = datetime_index[-1]
    fh = fh.to_absolute(cutoff=cutoff)

    if X_forecast is not None:
        X_forecast = X_forecast.set_index(fh.to_pandas())

    return (fh, y, X_train, X_forecast)
def k_folds(
        data: pd.DataFrame,
        period: int,
        window_length: int,
        n_predict: int,
        freq: str
):
    '''
    Build up to 10 expanding folds of ``data`` for backtesting.

    The number of folds is ``(len(data) - n_predict - 2 * period) // period``,
    capped at 10. Fold ``i`` (from ``k`` down to 1) drops the last
    ``i * period`` rows of ``data`` and is passed to ``split_x_y``, so folds
    are returned from shortest to longest. The smallest fold therefore still
    has at least 2 seasons plus ``n_predict`` rows, which means ``data`` needs
    at least ``3 * period + n_predict`` rows to yield a single fold.

    Parameters
    ----------
    data : frame to split; forwarded (truncated) to ``split_x_y``.
    period : seasonality period, in rows.
    window_length, n_predict, freq : forwarded unchanged to ``split_x_y``.

    Returns
    -------
    list of the tuples returned by ``split_x_y``, one per fold.

    Raises
    ------
    ValueError
        If ``data`` is too short to produce at least one fold.
    ZeroDivisionError
        If ``period`` is 0.
    '''
    print('[k_folds] ----- START -----')
    # Integer floor division: exact for ints, no float round-trip.
    k = (len(data) - n_predict - (2 * period)) // period
    print('k', k)
    # Make sure k is not larger than 10
    k = min(k, 10)
    # k can be negative for very short data, not just zero; without this
    # guard the loop below would silently produce an empty list.
    if k < 1:
        raise ValueError(
            'Data should at least have length of 3 seasons + n_predict rows, '
            f'currently length {len(data)}, '
            f'expected length of at least {3 * period + n_predict}')
    # Largest cut first so that folds grow from shortest to longest.
    folds = [
        split_x_y(
            data.iloc[: (-i * period)],
            window_length,
            n_predict,
            freq
        )
        for i in range(k, 0, -1)
    ]
    print('[k_folds] ----- END -----')
    return folds
 |