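"""Zero-shot glucose forecasting with TinyTimeMixer (TTM).

Loads the processed train/test CSVs, runs a zero-shot evaluation with a
pretrained TTM checkpoint, and writes raw and per-row averaged predictions
to data/outputs/.
"""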
import os
import tempfile

import pandas as pd
from transformers import Trainer, TrainingArguments, set_seed

from tsfm_public import TimeSeriesPreprocessor
from tsfm_public.toolkit.dataset import ForecastDFDataset
from tsfm_public.toolkit.get_model import get_model

# Constants
SEED = 42
TTM_MODEL_PATH = "iaravagni/ttm-finetuned-model"
CONTEXT_LENGTH = 52  # 52 steps ~ 4.33 h at the 5-minute CGM sampling interval
PREDICTION_LENGTH = 6  # 6 steps = 30 min ahead
OUT_DIR = "ttm_finetuned_models/"

def setup_environment():
    """
    Set up the environment for model training and evaluation.
    Creates necessary directories and sets random seed for reproducibility.
    """
    set_seed(SEED)
    os.makedirs(OUT_DIR, exist_ok=True)
    os.makedirs(os.path.join(OUT_DIR, 'results'), exist_ok=True)
    
def load_dataset(file_path):
    """
    Load the dataset from the specified file path.
    
    Args:
        file_path (str): Path to the dataset CSV file
        
    Returns:
        pd.DataFrame: The loaded dataset
    """
    return pd.read_csv(file_path)

def prepare_data(data, timestamp_column):
    """
    Prepare the dataset by converting timestamp column to datetime format.
    
    Args:
        data (pd.DataFrame): The dataset
        timestamp_column (str): Name of the timestamp column
        
    Returns:
        pd.DataFrame: The processed dataset
    """
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])
    return data

def get_column_specs():
    """
    Define and return column specifications for the dataset.
    
    Returns:
        dict: Column specifications including timestamp, ID, target, and control columns
    """
    timestamp_column = "Timestamp"
    id_columns = ["patient_id"]
    target_columns = ["Glucose"]
    control_columns = ["Accelerometer", "Calories", "Carbs", "Sugar", "Gender", "HbA1c", "Age"]
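
    # Illustrative row layout (hypothetical values; the real schema comes from
    # the processed CSVs):
    #   Timestamp            patient_id  Glucose  Accelerometer  ...  HbA1c  Age
    #   2024-01-01 00:00:00  1           110.0    0.12           ...  5.6    34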
    
    return {
        "timestamp_column": timestamp_column,
        "id_columns": id_columns,
        "target_columns": target_columns,
        "control_columns": control_columns,
    }

def create_test_only_dataset(ts_preprocessor, test_dataset, train_dataset=None, stride=1, enable_padding=True, **dataset_kwargs):
    """
    Creates a preprocessed pytorch dataset for testing only.
    
    Args:
        ts_preprocessor: TimeSeriesPreprocessor instance
        test_dataset: Pandas dataframe for testing
        train_dataset: Optional pandas dataframe for training the scaler
        stride: Stride used for creating the dataset
        enable_padding: If True, datasets are created with padding
        dataset_kwargs: Additional keyword arguments to pass to ForecastDFDataset
    
    Returns:
        ForecastDFDataset for testing
    """
    # Standardize the test dataframe
    test_data = ts_preprocessor._standardize_dataframe(test_dataset)
    
    # Train the preprocessor on the training data if provided, otherwise use test data
    if train_dataset is not None:
        train_data = ts_preprocessor._standardize_dataframe(train_dataset)
        ts_preprocessor.train(train_data)
    else:
        ts_preprocessor.train(test_data)
    
    # Preprocess the test data
    # Use the standardized frame as-is; the preprocessor's scaling step is
    # deliberately skipped here to avoid scaling errors on the test series
    test_data_prep = test_data.copy()
    
    # Specify columns
    column_specifiers = {
        "id_columns": ts_preprocessor.id_columns,
        "timestamp_column": ts_preprocessor.timestamp_column,
        "target_columns": ts_preprocessor.target_columns,
        "observable_columns": ts_preprocessor.observable_columns,
        "control_columns": ts_preprocessor.control_columns,
        "conditional_columns": ts_preprocessor.conditional_columns,
        "categorical_columns": ts_preprocessor.categorical_columns,
        "static_categorical_columns": ts_preprocessor.static_categorical_columns,
    }
    
    params = dict(column_specifiers)  # copy so the specifier dict is not mutated
    params["context_length"] = ts_preprocessor.context_length
    params["prediction_length"] = ts_preprocessor.prediction_length
    params["stride"] = stride
    params["enable_padding"] = enable_padding
    
    # Add frequency token - this is critical for TinyTimeMixer
    params["frequency_token"] = ts_preprocessor.get_frequency_token(ts_preprocessor.freq)
    
    # Update with any additional kwargs
    params.update(**dataset_kwargs)
    
    # Create the ForecastDFDataset (renamed to avoid shadowing the test_dataset argument)
    torch_dataset = ForecastDFDataset(test_data_prep, **params)
    
    if len(torch_dataset) == 0:
        raise RuntimeError("The generated test dataset is of zero length.")
    
    return torch_dataset


def zeroshot_eval(train_df, test_df, batch_size, context_length=CONTEXT_LENGTH, forecast_length=PREDICTION_LENGTH, model_path=TTM_MODEL_PATH):
    """
    Performs zero-shot evaluation of time series forecasting on test data.
    
    Args:
        train_df: Training dataframe
        test_df: Testing dataframe
        batch_size: Batch size for evaluation
        context_length: Number of time steps to use as context
        forecast_length: Number of time steps to predict
        model_path: Hugging Face model ID or local path passed to get_model
        
    Returns:
        dict: Dictionary containing predictions dataframe and metrics
    """
    column_specifiers = get_column_specs()
    
    # Create preprocessor with scaling disabled
    tsp = TimeSeriesPreprocessor(
        timestamp_column=column_specifiers["timestamp_column"],
        id_columns=column_specifiers["id_columns"],
        target_columns=column_specifiers["target_columns"],
        control_columns=column_specifiers["control_columns"],
        context_length=context_length,
        prediction_length=forecast_length,
        scaling=False,
        encode_categorical=False,
        force_return="zeropad",
    )
    
    # Load model
    zeroshot_model = get_model(
        model_path,
        context_length=context_length,
        prediction_length=forecast_length,
        freq_prefix_tuning=False,
        freq=None,
        prefer_l1_loss=False,
        prefer_longer_context=True,
    )
    
    # Create test dataset
    dset_test = create_test_only_dataset(ts_preprocessor=tsp, test_dataset=test_df, train_dataset=train_df)
    
    # Setup trainer
    temp_dir = tempfile.mkdtemp()
    zeroshot_trainer = Trainer(
        model=zeroshot_model,
        args=TrainingArguments(
            output_dir=temp_dir,
            per_device_eval_batch_size=batch_size,
            seed=SEED,
            report_to="none",
        ),
    )
    
    # Get predictions
    predictions_dict = zeroshot_trainer.predict(dset_test)
    
    # Process predictions
    processed_predictions = process_predictions(predictions_dict, tsp, column_specifiers["target_columns"])
    
    # Get evaluation metrics
    metrics = zeroshot_trainer.evaluate(dset_test)
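
    # Note: transformers' Trainer.evaluate() returns metrics keyed with an
    # "eval_" prefix (e.g. "eval_loss"); exact keys depend on the installed version.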
    
    return {
        "predictions_df": processed_predictions,
        "metrics": metrics
    }

def process_predictions(predictions_dict, tsp, target_columns):
    """
    Process the predictions from the Trainer into a usable DataFrame.
    
    Args:
        predictions_dict: Predictions from the Trainer
        tsp: TimeSeriesPreprocessor instance
        target_columns: List of target column names
        
    Returns:
        pd.DataFrame: DataFrame containing processed predictions
    """
    # Extract predictions
    if hasattr(predictions_dict, 'predictions'):
        raw_predictions = predictions_dict.predictions
    else:
        raw_predictions = predictions_dict.get('predictions', predictions_dict)
    
    # Handle tuple predictions (mean and uncertainty)
    if isinstance(raw_predictions, tuple):
        predictions = raw_predictions[0]
    else:
        predictions = raw_predictions
    
    # Get shape information
    n_samples, n_timesteps, n_features = predictions.shape
    
    # Create DataFrame for processed predictions
    processed_df = pd.DataFrame()
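
    # Resulting layout: with predictions of shape (n_samples, 6, 1) and
    # target_columns == ["Glucose"], the frame gains columns
    # Glucose_step_1 ... Glucose_step_6, one row per forecast window.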
    
    # Extract predictions for each target and timestep
    for i, col in enumerate(target_columns):
        if i < n_features:
            for t in range(n_timesteps):
                processed_df[f"{col}_step_{t+1}"] = predictions[:, t, i]
    
    return processed_df

def simple_diagonal_averaging(predictions_df, test_data, context_length, step_columns):
    """
    Per-patient averaging of step-wise predictions: each row after the context
    window receives the mean of the forecast steps from its matching prediction
    row. The tail rows are handled explicitly so that all available predicted
    values are used.
    
    Args:
        predictions_df (pd.DataFrame): DataFrame with step-wise predictions
        test_data (pd.DataFrame): Original test data with patient IDs
        context_length (int): Number of context steps used in the model
        step_columns (list): List of step column names
    
    Returns:
        pd.DataFrame: DataFrame with averaged predictions
    """
    # Create a new dataframe for the final results
    final_df = test_data.copy()
    
    # Initialize the prediction column (rows inside the context window keep 0.0;
    # float initializer avoids dtype upcasting when assigning means below)
    final_df['averaged_prediction'] = 0.0
    
    # Process each patient separately
    for patient_id in test_data['patient_id'].unique():
        # Get indices for this patient
        patient_mask = final_df['patient_id'] == patient_id
        patient_indices = final_df[patient_mask].index
        
        # Skip the first context_length rows for this patient
        start_idx = min(context_length, len(patient_indices))
        
        # For each row after the context window
        for i in range(start_idx, len(patient_indices)):
            row_idx = patient_indices[i]
            pred_row_idx = i - context_length
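
            # Alignment sketch: with context_length=52, the patient's row at
            # local index 52 maps to predictions_df row 0 (its first forecast
            # window), index 53 to row 1, and so on.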
            
            # Skip if the prediction row index is negative
            if pred_row_idx < 0:
                continue
                
            # Get the corresponding prediction row
            if pred_row_idx < len(predictions_df):
                # Average the predictions for all steps
                avg_prediction = predictions_df.iloc[pred_row_idx][step_columns].mean()
                final_df.loc[row_idx, 'averaged_prediction'] = avg_prediction
            else:
                # Handle the case where we've run out of prediction rows
                # Calculate how many steps beyond the last prediction row we are
                excess_steps = pred_row_idx - len(predictions_df) + 1
                
                # Make sure we don't go beyond the available steps
                if excess_steps < len(step_columns):
                    # Use the last row of predictions but only the appropriate steps
                    relevant_steps = step_columns[excess_steps:]
                    if relevant_steps:
                        avg_prediction = predictions_df.iloc[-1][relevant_steps].mean()
                        final_df.loc[row_idx, 'averaged_prediction'] = avg_prediction
    
    return final_df


def main():
    """
    Main function to execute the time series forecasting workflow.
    """
    # Setup: seed and output directories
    setup_environment()
    
    # Get dataset path
    script_dir = os.path.dirname(os.path.abspath(__file__))
    test_file = os.path.join(script_dir, '..', 'data', 'processed', 'test_dataset.csv')
    train_file = os.path.join(script_dir, '..', 'data', 'processed', 'train_dataset.csv')
    
    # Load and prepare data
    test_data = load_dataset(test_file)
    train_data = load_dataset(train_file)
    column_specs = get_column_specs()
    test_data = prepare_data(test_data, column_specs["timestamp_column"])
    train_data = prepare_data(train_data, column_specs["timestamp_column"])
    
    # Run zero-shot evaluation
    results = zeroshot_eval(
        train_df=train_data,
        test_df=test_data,
        batch_size=8,
        context_length=CONTEXT_LENGTH,
        forecast_length=PREDICTION_LENGTH
    )

    # Get all step columns
    step_columns = [col for col in results["predictions_df"].columns if col.startswith("Glucose_step_")]
    
    # Apply simple diagonal averaging by patient
    final_results = simple_diagonal_averaging(
        results["predictions_df"], 
        test_data, 
        CONTEXT_LENGTH,
        step_columns
    )
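
    # final_results is the test frame plus an 'averaged_prediction' column;
    # rows inside each patient's initial context window keep the 0.0 initializer.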
    
    # Ensure the output directory exists, then save raw predictions to CSV
    outputs_dir = os.path.join(script_dir, '..', 'data', 'outputs')
    os.makedirs(outputs_dir, exist_ok=True)
    raw_predictions_path = os.path.join(outputs_dir, 'dl_predictions_raw.csv')
    results["predictions_df"].to_csv(raw_predictions_path, index=False)
    
    # Save final results to CSV
    final_results_path = os.path.join(outputs_dir, 'dl_predictions.csv')
    final_results.to_csv(final_results_path, index=False)
    
    return

if __name__ == "__main__":
    main()