import os
from datetime import date, datetime, timedelta

import joblib
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login
from src.data_api_calls import (
    get_combined_data,
    update_pollution_data,
    update_weather_data,
)
from src.features_pipeline import create_features

# Authenticate with the Hugging Face Hub using the read token from the local .env file
load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))


def load_nn() -> torch.nn.Module:
    """
    Loads the neural network model for air pollution forecasting.

    Returns:
        torch.nn.Module: The loaded neural network model.
    """
    # torch.nn and the Hub mixin are only used inside this loader, so they are
    # imported locally.
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
        """Feed-forward regressor: stacked Linear/ReLU/Dropout blocks with a 3-value output head."""

        def __init__(self, input_size: int, layers: list[int], dropout_rate: float):
            super().__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """
            Forward pass of the neural network.

            Args:
                x (torch.Tensor): Input tensor.

            Returns:
                torch.Tensor: Output tensor after passing through the network.
            """
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model
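
# A minimal sketch of calling load_nn() directly (illustrative only; the input
# width must match the feature pipeline's output, recovered here from the first
# linear layer):
#
#   model = load_nn()
#   model.eval()  # dropout off for inference
#   with torch.no_grad():
#       scaled = model(torch.zeros(1, model.layers_list[0].in_features))
#   # `scaled` holds standardized NO2 values for the next 3 days; run_model()
#   # applies the inverse transform back to concentrations.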


def load_model(particle: str) -> object:
    """
    Loads the forecasting model based on the specified particle.

    Args:
        particle (str): The type of particle ("O3" or "NO2").

    Returns:
        object: The loaded model (a PyTorch neural network for NO2, or a scikit-learn support vector regression model for O3).
    """
    if particle == "O3":
        repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        model_path = hf_hub_download(repo_id=repo_id, filename="O3_svr_model.pkl")
        model = joblib.load(model_path)
    else:
        model = load_nn()

    return model
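
# Note: load_model("O3") returns the scikit-learn SVR; any other particle string
# falls through to the NO2 neural network.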


def run_model(particle: str, data: pd.DataFrame) -> np.ndarray:
    """
    Runs the model for the specified particle and makes predictions based on the input data.

    Args:
        particle (str): The type of particle ("O3" or "NO2").
        data (pd.DataFrame): The input data for making predictions.

    Returns:
        np.ndarray: Inverse-scaled predictions with shape (1, 3), one value for each of the next three days.
    """
    input_data = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        model.eval()  # disable dropout so inference is deterministic
        with torch.no_grad():
            prediction = model(torch.tensor(input_data.values, dtype=torch.float32))
        prediction = prediction.numpy()  # the scaler below expects a NumPy array
        repo_id = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        file_name = "target_scaler_NO2.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)
    else:
        prediction = model.predict(input_data)

        repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        file_name = f"target_scaler_{particle}.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)

    target_scaler = joblib.load(path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction
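
# Example usage (sketch; in this project the frame comes from get_combined_data()):
#
#   preds = run_model("O3", data=get_combined_data())
#   preds.shape  # -> (1, 3): one row with three daily forecasts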


def update_data_and_predictions() -> None:
    """
    Updates the weather and pollution data, makes predictions for O3 and NO2,
    and stores them in a CSV file.
    """
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    # Each model returns a (1, 3) array: one value per day for the next 3 days
    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Drop predictions already made today so reruns don't create duplicates
        existing_data = existing_data[
            existing_data["date_predicted"] != str(date.today())
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        combined_data = combined_data.drop_duplicates()
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)
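
# The history file written above holds one row per pollutant per forecast day
# (values illustrative):
#
#   pollutant,date_predicted,date,prediction_value
#   O3,2024-10-01,2024-10-02,34.1
#   NO2,2024-10-01,2024-10-02,21.7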


def get_data_and_predictions() -> tuple[pd.DataFrame, list, list]:
    """
    Retrieves combined data and today's predictions for O3 and NO2.

    Returns:
        tuple: A tuple containing:
            - week_data (pd.DataFrame): The combined data for the week.
            - list: Predictions for O3.
            - list: Predictions for NO2.
    """
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    data = pd.read_csv(PREDICTIONS_FILE)

    today = datetime.today().strftime("%Y-%m-%d")
    today_predictions = data[data["date_predicted"] == today]

    # Extract predictions for O3 and NO2
    o3_predictions = today_predictions[today_predictions["pollutant"] == "O3"][
        "prediction_value"
    ].values
    no2_predictions = today_predictions[today_predictions["pollutant"] == "NO2"][
        "prediction_value"
    ].values

    return week_data, [o3_predictions], [no2_predictions]
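
# Note: each prediction array is wrapped in a single-element list so callers can
# index predictions[0][i], mirroring the (1, 3) shape returned by run_model().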


if __name__ == "__main__":
    update_data_and_predictions()