|
import os |
|
import warnings |
|
|
|
import joblib |
|
import numpy as np |
|
import pandas as pd |
|
from dotenv import load_dotenv |
|
from huggingface_hub import hf_hub_download, login |
|
|
|
from src.past_data_api_calls import get_past_combined_data |
|
|
|
# Install a blanket "ignore" filter so no warning is shown for the rest of the
# process.
warnings.filterwarnings(action="ignore")

# Must run before the token lookup below; presumably this populates the
# environment from a local .env file (python-dotenv) -- confirm deployment setup.
load_dotenv()

# Authenticate with the Hugging Face Hub at import time. The lookup yields
# None when HUGGINGFACE_DOWNLOAD_TOKEN is not set.
login(token=os.environ.get("HUGGINGFACE_DOWNLOAD_TOKEN"))
|
|
|
|
|
def create_features(
    data: pd.DataFrame,
    target_particle: str,
    lag_days: int = 7,
    sma_days: int = 7,
) -> pd.DataFrame:
    """Build and scale the model input features for one target pollutant.

    The input frame is copied (the caller's frame is never modified), sorted
    by ``date`` and extended with:

    * cyclical (sin/cos) encodings of weekday and month,
    * ``lag_days`` lagged copies of every base feature,
    * an ``sma_days`` rolling mean of every base feature,
    * constant columns holding values read from ``get_past_combined_data()``.

    Rows containing any missing value are then dropped (the first rows have
    no lag / rolling history), and the remaining features are transformed
    with the scaler downloaded from the Hugging Face Hub for the pollutant.

    Args:
        data: Frame with a ``date`` column plus the base feature columns
            ``NO2``, ``O3``, ``wind_speed``, ``mean_temp``,
            ``global_radiation``, ``minimum_visibility`` and ``humidity``
            (and ``percipitation`` / ``pressure`` when the target is NO2).
        target_particle: ``"O3"`` or ``"NO2"``.
        lag_days: Number of lag columns per base feature; also the number of
            "days before" values read from the past-data frame.
        sma_days: Window size of the rolling mean.

    Returns:
        DataFrame of scaled features; every column of the engineered frame
        except ``date``, ``weekday`` and ``month``, in creation order.

    Raises:
        ValueError: If ``target_particle`` is not ``"O3"`` or ``"NO2"``.
        KeyError: If ``data`` lacks one or more required columns.
    """
    # Fail fast, before any work is done on ``data``.
    if target_particle not in ("O3", "NO2"):
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        # NOTE: "percipitation" (sic) is the runtime column name; keep the
        # spelling so it matches the incoming data.
        lag_features = lag_features + ["percipitation", "pressure"]

    # Report every missing input column at once instead of failing on the
    # first one somewhere inside the lag loop.
    missing = [col for col in ["date", *lag_features] if col not in data.columns]
    if missing:
        raise KeyError(f"data is missing required columns: {missing}")

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)

    # Derive calendar fields from the date whenever they are absent or not
    # numeric (e.g. weekday names), since the trig encoding needs numbers.
    is_numeric = pd.api.types.is_numeric_dtype
    if "weekday" not in data.columns or not is_numeric(data["weekday"]):
        data["weekday"] = data["date"].dt.weekday
    if "month" not in data.columns or not is_numeric(data["month"]):
        data["month"] = data["date"].dt.month

    # Cyclical encoding so the last and first weekday / month are neighbours.
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Lagged copies of every base feature (column order matters: the fitted
    # scaler is applied to the columns in creation order).
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Rolling mean of every base feature over ``sma_days`` rows.
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )

    # Each value read below is a single scalar broadcast to every row.
    # NOTE(review): the positions (-4, 0..lag_days-1, -1) presumably assume a
    # fixed row layout of the past-data frame -- confirm against
    # get_past_combined_data().
    past_data = get_past_combined_data()
    past_o3 = past_data["O3"]
    past_no2 = past_data["NO2"]

    data["O3_last_year"] = past_o3.iloc[-4]
    data["NO2_last_year"] = past_no2.iloc[-4]

    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = past_o3.iloc[i - 1]
        data[f"NO2_last_year_{i}_days_before"] = past_no2.iloc[i - 1]

    data["O3_last_year_3_days_after"] = past_o3.iloc[-1]
    data["NO2_last_year_3_days_after"] = past_no2.iloc[-1]

    # Drop incomplete rows; dropna/reset_index keep the date order set above,
    # so no re-sort is needed afterwards.
    rows_before = data.shape[0]
    data = data.dropna().reset_index(drop=True)
    rows_dropped = rows_before - data.shape[0]
    print(f"Number of rows with missing values dropped: {rows_dropped}/{rows_before}")

    # Helper columns are not model inputs.
    exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]
    features = data[feature_cols]

    # NOTE(review): joblib.load unpickles the downloaded file, which can run
    # arbitrary code -- only safe while the Hub repository is trusted.
    repo_id = f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model"
    file_name = f"feature_scaler_{target_particle}.joblib"
    path = hf_hub_download(repo_id=repo_id, filename=file_name)
    feature_scaler = joblib.load(path)

    scaled = feature_scaler.transform(features)
    return pd.DataFrame(scaled, columns=feature_cols, index=features.index)
|
|