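"""Pollution forecasting pipeline for Utrecht.

Downloads pretrained O3 and NO2 models from the Hugging Face Hub, generates
three-day forecasts from combined weather and pollution data, and appends the
results to a local predictions history CSV.
"""
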
import os
from datetime import date, datetime, timedelta

import joblib
import pandas as pd
import torch
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login

from src.data_api_calls import (
    get_combined_data,
    update_pollution_data,
    update_weather_data,
)
from src.features_pipeline import create_features

load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))


def load_nn():
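    """Define the NO2 feed-forward network and load its pretrained weights
    from the Hugging Face Hub."""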
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
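        """Feed-forward net: stacked Linear/ReLU/Dropout blocks with a
        3-unit output head, one value per forecast day."""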
        def __init__(self, input_size, layers, dropout_rate):
            super().__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x):
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model


def load_model(particle):
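    """Return the forecasting model for `particle`: a pickled SVR for O3,
    otherwise the pretrained neural network (NO2)."""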
    repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
    if particle == "O3":
        file_name = "O3_svr_model.pkl"
        model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
        model = joblib.load(model_path)
    else:
        model = load_nn()

    return model


def run_model(particle, data):
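    """Predict the next three days for `particle` and map the scaled output
    back to original units with the matching target scaler from the Hub."""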
    input_data = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        model.eval()  # disable dropout so inference is deterministic
        with torch.no_grad():
            prediction = model(torch.tensor(input_data.values, dtype=torch.float32))
        prediction = prediction.numpy()  # the scaler expects an array-like
        repo_id = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        file_name = "target_scaler_NO2.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)
    else:
        prediction = model.predict(input_data)

        repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        file_name = f"target_scaler_{particle}.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)

    target_scaler = joblib.load(path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction


def update_data_and_predictions():
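    """Refresh weather and pollution data, run both models, and append
    today's three-day forecasts to the predictions history CSV."""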
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Filter out predictions made today to avoid duplicates
        existing_data = existing_data[
            ~(existing_data["date_predicted"] == str(date.today()))
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        combined_data = combined_data.drop_duplicates()
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)


def get_data_and_predictions():
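    """Return this week's combined data plus today's O3 and NO2 predictions
    read back from the history CSV."""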
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    data = pd.read_csv(PREDICTIONS_FILE)

    today = datetime.today().strftime("%Y-%m-%d")
    today_predictions = data[(data["date_predicted"] == today)]

    # Extract predictions for O3 and NO2
    o3_predictions = today_predictions[today_predictions["pollutant"] == "O3"][
        "prediction_value"
    ].values
    no2_predictions = today_predictions[today_predictions["pollutant"] == "NO2"][
        "prediction_value"
    ].values

    return week_data, [o3_predictions], [no2_predictions]


if __name__ == "__main__":
    update_data_and_predictions()