Spaces:

Mihkelmj
/

utrecht-pollution-prediction

Sleeping

File size: 3,682 Bytes

359c749

import os
import warnings

import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login

from src.past_data_api_calls import get_past_combined_data

# Silence every warning category for the rest of the process.
warnings.filterwarnings("ignore")

# Import-time setup: read variables from a .env file into the environment,
# then log in to the Hugging Face Hub with the token stored in
# HUGGINGFACE_DOWNLOAD_TOKEN. os.getenv returns None when the variable is
# unset, in which case login() is called with token=None.
load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))


def create_features(
    data,
    target_particle,
    lag_days=7,
    sma_days=7,
):
    """Build the scaled feature frame for one target pollutant.

    Working on a copy of ``data``, this adds cyclical weekday/month
    encodings, per-day lags and a rolling mean for a fixed set of columns,
    and constant "last year" O3/NO2 values read from
    ``get_past_combined_data()``. Rows containing missing values are then
    dropped and the remaining feature columns are transformed with the
    scaler downloaded from the Hugging Face Hub repo of ``target_particle``.

    Side effects: prints the dropped-row count and the cleaned frame to
    stdout, and downloads (or reuses a cached copy of) the scaler file.

    Args:
        data: DataFrame with a ``date`` column plus the columns ``NO2``,
            ``O3``, ``wind_speed``, ``mean_temp``, ``global_radiation``,
            ``minimum_visibility`` and ``humidity`` (and ``percipitation``
            and ``pressure`` when ``target_particle`` is ``"NO2"``). The
            caller's frame is not modified.
        target_particle: Either ``"O3"`` or ``"NO2"``.
        lag_days: Number of daily lag columns created per source column;
            also the number of "days before" last-year columns.
        sma_days: Window length of the rolling mean.

    Returns:
        DataFrame holding the scaled values of every column except
        ``date``, ``weekday`` and ``month``, indexed like the cleaned rows.

    Raises:
        ValueError: If ``target_particle`` is neither ``"O3"`` nor ``"NO2"``.
    """
    # Reject unsupported targets before doing any work.
    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    source_columns = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        # The NO2 variant uses two additional source columns.
        source_columns += ["percipitation", "pressure"]

    frame = data.copy()
    frame["date"] = pd.to_datetime(frame["date"])
    frame = frame.sort_values("date").reset_index(drop=True)

    # Derive the calendar columns from 'date' unless they already exist; an
    # object-dtype 'weekday' column is always recomputed.
    keep_weekday = "weekday" in frame.columns and not (
        frame["weekday"].dtype == object
    )
    if not keep_weekday:
        frame["weekday"] = frame["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in frame.columns:
        frame["month"] = frame["date"].dt.month  # 1 to 12

    # Cyclical encodings, so the last weekday/month sits next to the first.
    full_turn = 2 * np.pi
    weekday_angle = full_turn * frame["weekday"] / 7
    month_angle = full_turn * (frame["month"] - 1) / 12
    frame["weekday_sin"] = np.sin(weekday_angle)
    frame["weekday_cos"] = np.cos(weekday_angle)
    frame["month_sin"] = np.sin(month_angle)
    frame["month_cos"] = np.cos(month_angle)

    # Lag columns: the value of each source column 1..lag_days rows earlier.
    for column in source_columns:
        series = frame[column]
        for offset in range(1, lag_days + 1):
            frame[f"{column}_lag_{offset}"] = series.shift(offset)

    # Rolling mean over the trailing sma_days rows (current row included).
    for column in source_columns:
        window = frame[column].rolling(window=sma_days)
        frame[f"{column}_sma_{sma_days}"] = window.mean()

    # "Last year" pollutant values. Each is a single scalar broadcast to
    # every row of the frame.
    # NOTE(review): the positions below are fixed offsets into whatever
    # get_past_combined_data() returns (-4 = "today", -1 = "3 days after",
    # 0..lag_days-1 = "1..lag_days days before"). Note that i=1 reads the
    # FIRST row; confirm this matches the row order the scaler and model
    # were trained with.
    history = get_past_combined_data()
    pollutants = ("O3", "NO2")

    for pollutant in pollutants:
        frame[f"{pollutant}_last_year"] = history[pollutant].iloc[-4]

    for days_back in range(1, lag_days + 1):
        for pollutant in pollutants:
            frame[f"{pollutant}_last_year_{days_back}_days_before"] = history[
                pollutant
            ].iloc[days_back - 1]

    for pollutant in pollutants:
        frame[f"{pollutant}_last_year_3_days_after"] = history[pollutant].iloc[-1]

    # Remove rows with any missing value (the first rows always lack lags).
    total_rows = len(frame)
    frame = frame.dropna().reset_index(drop=True)
    dropped_rows = total_rows - len(frame)
    print(f"Number of rows with missing values dropped: {dropped_rows}/{total_rows}")
    print(frame)

    # Keep rows in ascending date order.
    frame = frame.sort_values("date").reset_index(drop=True)

    # Everything except the raw calendar columns is a model feature; column
    # order is the frame's insertion order.
    non_feature_columns = ("date", "weekday", "month")
    feature_cols = [
        name for name in frame.columns if name not in non_feature_columns
    ]
    features = frame[feature_cols]

    # Fetch the fitted scaler that belongs to this pollutant's model repo.
    # NOTE: joblib.load unpickles the downloaded file, so the repo must be
    # trusted.
    scaler_path = hf_hub_download(
        repo_id=f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model",
        filename=f"feature_scaler_{target_particle}.joblib",
    )
    scaler = joblib.load(scaler_path)

    # Wrap the scaled array back into a DataFrame with the same labels.
    return pd.DataFrame(
        scaler.transform(features),
        columns=feature_cols,
        index=features.index,
    )