import os
from datetime import date, datetime, timedelta

import joblib
import pandas as pd
import torch
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login
from src.data_api_calls import (
    get_combined_data,
    update_pollution_data,
    update_weather_data,
)
from src.features_pipeline import create_features

load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))


def load_nn():
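    """Rebuild the NO2 network architecture and load its trained weights from the Hugging Face Hub."""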
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin
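    # PyTorchModelHubMixin adds from_pretrained()/save_pretrained() to nn.Module
    # subclasses, so the trained weights can be pulled straight from the Hub.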

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
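        """Feed-forward net of Linear/ReLU/Dropout blocks with a 3-unit head, one output per forecast day."""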
        def __init__(self, input_size, layers, dropout_rate):
            super().__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x):
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model


def load_model(particle):
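    """Return the trained model for `particle`: a joblib-packaged SVR for O3, the neural net otherwise (NO2)."""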
    repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
    if particle == "O3":
        file_name = "O3_svr_model.pkl"
        model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
        model = joblib.load(model_path)
    else:
        model = load_nn()

    return model


def run_model(particle, data):
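    """Engineer features for `particle`, run the matching model, and return inverse-scaled predictions."""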
    input_data = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        model.eval()  # disable the dropout layers so inference is deterministic
        with torch.no_grad():
            prediction = model(torch.tensor(input_data.values, dtype=torch.float32))
        prediction = prediction.numpy()  # hand the scaler a NumPy array
        repo_id = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        file_name = "target_scaler_NO2.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)
    else:
        prediction = model.predict(input_data)

        repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        file_name = f"target_scaler_{particle}.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)

    # Map the scaled model output back to the original measurement units.
    target_scaler = joblib.load(path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction


def update_data_and_predictions():
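    """Pull fresh weather and pollution data, forecast the next three days
    of O3 and NO2, and persist the results to the predictions history CSV."""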
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Filter out predictions made today to avoid duplicates
        existing_data = existing_data[
            ~(existing_data["date_predicted"] == str(date.today()))
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        combined_data = combined_data.drop_duplicates()  # returns a copy; reassign it
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)


def get_data_and_predictions():
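    """Load the combined week of data plus today's stored O3 and NO2 predictions."""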
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    data = pd.read_csv(PREDICTIONS_FILE)

    today = datetime.today().strftime("%Y-%m-%d")
    today_predictions = data[(data["date_predicted"] == today)]

    # Extract predictions for O3 and NO2
    o3_predictions = today_predictions[today_predictions["pollutant"] == "O3"][
        "prediction_value"
    ].values
    no2_predictions = today_predictions[today_predictions["pollutant"] == "NO2"][
        "prediction_value"
    ].values

    return week_data, [o3_predictions], [no2_predictions]
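

# Minimal usage sketch, assuming the module is run directly as a daily job;
# the real entry point (e.g. a scheduler or app) is not shown in this file.
if __name__ == "__main__":
    update_data_and_predictions()
    week_data, o3_preds, no2_preds = get_data_and_predictions()
    print("O3, next 3 days:", o3_preds[0])
    print("NO2, next 3 days:", no2_preds[0])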