import os
import warnings
import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login
from src.past_data_api_calls import get_past_combined_data
warnings.filterwarnings("ignore")
load_dotenv()
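# Authenticate with the Hugging Face Hub; a read token is expected in the
# HUGGINGFACE_DOWNLOAD_TOKEN environment variable (loaded from .env above)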
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))


def create_features(
    data,
    target_particle,  # "O3" or "NO2"
    lag_days=7,
    sma_days=7,
):
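    """Build the scaled feature matrix for the Utrecht pollution forecast.

    Adds cyclical date encodings, lagged and rolling-mean (SMA) versions of
    the weather/pollution columns, and last year's particle levels, then
    applies the pre-fit feature scaler downloaded from the Hugging Face Hub.
    """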
    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        # "percipitation" is spelled as it appears in the source data columns
        lag_features = lag_features + ["percipitation", "pressure"]

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)
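    # (chronological order matters: shift() and rolling() below assume it)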

    # Extract 'weekday' and 'month' from 'date' if not present
    if "weekday" not in data.columns or data["weekday"].dtype == object:
        data["weekday"] = data["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in data.columns:
        data["month"] = data["date"].dt.month  # 1 to 12

    # Encode 'weekday' and 'month' as sine/cosine pairs so their cyclical
    # wrap-around (Sunday -> Monday, December -> January) is preserved
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Create lagged features for the specified lag days
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Create SMA features
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )
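    # Note: shift() and rolling() leave NaNs in the earliest rows (up to
    # lag_days / sma_days of them per column); those rows are dropped below.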

    # Create particle data (NO2 and O3) from the same time last year
    past_data = get_past_combined_data()
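    # The indexing below appears to assume an 11-day window centred on "today
    # last year": rows 0-6 are the 7 days before, row 7 (iloc[-4]) is the day
    # itself, and rows 8-10 (ending at iloc[-1]) are the 3 days after.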

    # Today last year
    data["O3_last_year"] = past_data["O3"].iloc[-4]
    data["NO2_last_year"] = past_data["NO2"].iloc[-4]

    # Each of the lag_days days before today last year
    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]

    # 3 days after today last year
    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]

    # Drop rows with missing values (mostly the initial lag/SMA rows)
    rows_before = data.shape[0]
    data = data.dropna().reset_index(drop=True)
    rows_after = data.shape[0]
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}/{rows_before}")

    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

    # Define feature columns; raw 'weekday' and 'month' are excluded in
    # favor of their sine/cosine encodings
    exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Select the feature matrix (no target split is needed here)
    X = data[feature_cols]

    # Scale with the pre-fit feature scaler published alongside the model
repo_id = f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model"
file_name = f"feature_scaler_{target_particle}.joblib"
path = hf_hub_download(repo_id=repo_id, filename=file_name)
feature_scaler = joblib.load(path)
X_scaled = feature_scaler.transform(x)
# Convert scaled data back to DataFrame for consistency
X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)
return X_scaled
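

# A minimal smoke-test sketch (not part of the original pipeline): it feeds
# synthetic data with the expected columns through create_features. Assumes
# Hub access for the scaler and a working get_past_combined_data().
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 30
    demo = pd.DataFrame(
        {
            "date": pd.date_range("2024-01-01", periods=n, freq="D"),
            "NO2": rng.uniform(10, 40, n),
            "O3": rng.uniform(20, 80, n),
            "wind_speed": rng.uniform(0, 10, n),
            "mean_temp": rng.uniform(-5, 25, n),
            "global_radiation": rng.uniform(0, 300, n),
            "minimum_visibility": rng.uniform(1, 50, n),
            "humidity": rng.uniform(30, 100, n),
        }
    )
    X_demo = create_features(demo, target_particle="O3")
    print(X_demo.shape)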