akseljoonas commited on
Commit
359c749
·
0 Parent(s):

Update .gitlab-ci.yml file

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .venv/
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ *.joblib
6
+ scalers/
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Utrecht Pollution Prediction
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.39.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: 'Demo: Model to predict O3 and NO2 concentrations in Utrecht'
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/data_api_calls.cpython-312.pyc ADDED
Binary file (10.3 kB). View file
 
__pycache__/data_loading.cpython-312.pyc ADDED
Binary file (7.96 kB). View file
 
__pycache__/helper_functions.cpython-312.pyc ADDED
Binary file (2 kB). View file
 
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import altair as alt
2
+ import pandas as pd
3
+ import plotly.graph_objects as go
4
+ import streamlit as st
5
+
6
+ from src.helper_functions import custom_metric_box, pollution_box
7
+ from src.predict import get_data_and_predictions, update_data_and_predictions
8
+
9
+ st.set_page_config(
10
+ page_title="Utrecht Pollution Dashboard ",
11
+ page_icon="🌱",
12
+ layout="wide",
13
+ initial_sidebar_state="expanded",
14
+ )
15
+
16
+ alt.themes.enable("dark")
17
+
18
+ update_data_and_predictions()
19
+
20
+ week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
21
+
22
+ today = week_data.iloc[-1]
23
+ previous_day = week_data.iloc[-2]
24
+
25
+ dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
26
+ dates_future = pd.date_range(
27
+ start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3
28
+ ).to_list()
29
+
30
+ # O3 and NO2 values for the past 7 days
31
+ o3_past_values = week_data["O3"]
32
+ no2_past_values = week_data["NO2"]
33
+ o3_future_values = pd.Series(predictions_O3[0].flatten())
34
+ no2_future_values = pd.Series(predictions_NO2[0].flatten())
35
+ o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
36
+ no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
37
+
38
+ dates = dates_past + dates_future
39
+ df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
40
+
41
+ # App Title
42
+ st.title("Utrecht Pollution Dashboard 🌱")
43
+
44
+ col1, col2 = st.columns((1, 3))
45
+ # Create a 3-column layout
46
+ with col1:
47
+ st.subheader("Current Weather")
48
+
49
+
50
+ custom_metric_box(
51
+ label="🥵 Temperature",
52
+ value=f"{round(today['mean_temp'] * 0.1)} °C",
53
+ )
54
+ custom_metric_box(
55
+ label="💧 Humidity",
56
+ value=f"{round(today['humidity'])} %",
57
+ )
58
+ custom_metric_box(
59
+ label="🪨 Pressure",
60
+ value=f"{round(today['pressure'] * 0.1)} hPa",
61
+ )
62
+
63
+ custom_metric_box(
64
+ label="🌧️ Precipitation",
65
+ value=f"{round(today['percipitation'] * 0.1)} mm",
66
+ )
67
+ custom_metric_box(
68
+ label="🌤️ Solar Radiation",
69
+ value=f"{round(today['global_radiation'])} J/m²",
70
+ )
71
+ custom_metric_box(
72
+ label="🌪️ Wind Speed",
73
+ value=f"{round(today['wind_speed'] * 0.1, 1)} m/s",
74
+ )
75
+
76
+ with col2:
77
+ st.subheader("Current Pollution Levels")
78
+ sub1, sub2 = st.columns((1, 1))
79
+
80
+ # Ozone (O₃) Pollution Box
81
+ with sub1:
82
+ pollution_box(
83
+ label="O<sub>3</sub>",
84
+ value=f"{round(today['O3'])} µg/m³",
85
+ delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
86
+ threshold=120
87
+ )
88
+ with st.expander("Learn more about O3", expanded=False):
89
+ st.markdown(
90
+ """
91
+ *Ozone (O<sub>3</sub>)*: A harmful gas at ground level that can irritate the respiratory system and aggravate asthma.<br>
92
+ **Good/Bad**: "Good" means safe levels for most people, while "Bad" suggests harmful levels, especially for sensitive groups.
93
+ """,
94
+ unsafe_allow_html=True,
95
+ )
96
+
97
+ # Nitrogen Dioxide (NO₂) Pollution Box
98
+ with sub2:
99
+ pollution_box(
100
+ label="NO<sub>2</sub>",
101
+ value=f"{round(today['NO2'])} µg/m³",
102
+ delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
103
+ threshold=40
104
+ )
105
+ with st.expander("Learn more about NO2", expanded=False):
106
+ st.markdown(
107
+ """
108
+ *Nitrogen Dioxide (NO<sub>2</sub>)*: A toxic gas that contributes to lung irritation and worsens asthma and other respiratory issues.<br>
109
+ **Good/Bad**: "Good" means safe air quality, while "Bad" indicates levels that could cause respiratory problems, especially for vulnerable individuals.
110
+ """,
111
+ unsafe_allow_html=True,
112
+ )
113
+
114
+ # Create two columns for two separate graphs
115
+ st.subheader("O3 Forecast")
116
+
117
# Colour helper: traffic-light banding of pollutant values around a threshold
def get_simple_color_scale(values, threshold):
    """Map each value to a traffic-light colour string.

    Green (#77C124) below *threshold*, orange (#E68B0A) from the threshold
    up to (but excluding) twice the threshold, red (#E63946) at or above
    twice the threshold.
    """
    def colour_for(v):
        if v < threshold:
            return "#77C124"
        if v < 2 * threshold:
            return "#E68B0A"
        return "#E63946"

    return [colour_for(v) for v in values]
125
+
126
# O3 Bar Plot — colour bands keyed to the EU O3 threshold of 120 µg/m³.
# BUG FIX: the previous value (40) is the NO2 threshold; it was swapped
# with the NO2 plot's, inconsistent with pollution_box(threshold=120) above.
o3_past_values = o3_values[:-3]  # all but the last 3 values are observations
o3_future_values = o3_values[-3:]  # last 3 values are model predictions
o3_colors = get_simple_color_scale(o3_past_values, 120)  # colours for past values

fig_o3 = go.Figure()

# Observed (past) O3 concentrations
fig_o3.add_trace(
    go.Bar(
        x=df["Date"][:-3],  # dates for past values
        y=o3_past_values,
        name="O3 Past",
        marker=dict(color=o3_colors),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)

# Predicted O3 values, drawn semi-transparent to distinguish them
predicted_o3_colors = get_simple_color_scale(o3_future_values, 120)
fig_o3.add_trace(
    go.Bar(
        x=df["Date"][-3:],  # dates for predicted values
        y=o3_future_values,
        name="O3 Predicted",
        marker=dict(color=predicted_o3_colors, opacity=0.5),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)
155
+
156
+ fig_o3.add_shape(
157
+ dict(
158
+ type="line",
159
+ x0=pd.Timestamp.today(),
160
+ x1=pd.Timestamp.today(),
161
+ y0=min(o3_values),
162
+ y1=max(o3_values),
163
+ line=dict(color="White", width=3, dash="dash"),
164
+ )
165
+ )
166
+
167
+ fig_o3.update_layout(
168
+ plot_bgcolor="rgba(0, 0, 0, 0)",
169
+ paper_bgcolor="rgba(0, 0, 0, 0)",
170
+ yaxis_title="O3 Concentration (µg/m³)",
171
+ font=dict(size=14),
172
+ hovermode="x",
173
+ xaxis=dict(
174
+ title="Date",
175
+ type="date",
176
+ tickmode="array",
177
+ tickvals=df["Date"],
178
+ tickformat="%d-%b",
179
+ tickangle=-45,
180
+ tickcolor="gray",
181
+ ),
182
+ showlegend=False # Disable legend
183
+ )
184
+
185
+ st.plotly_chart(fig_o3, key="fig_o3")
186
+
187
# NO2 Bar Plot — colour bands keyed to the EU NO2 threshold of 40 µg/m³.
# BUG FIX: the previous value (120) is the O3 threshold; it was swapped
# with the O3 plot's, inconsistent with pollution_box(threshold=40) above.
st.subheader("NO2 Forecast")
no2_past_values = no2_values[:-3]  # all but the last 3 values are observations
no2_future_values = no2_values[-3:]  # last 3 values are model predictions
no2_colors = get_simple_color_scale(no2_past_values, 40)  # colours for past values

fig_no2 = go.Figure()

# Observed (past) NO2 concentrations
fig_no2.add_trace(
    go.Bar(
        x=df["Date"][:-3],  # dates for past values
        y=no2_past_values,
        name="NO2 Past",
        marker=dict(color=no2_colors),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)

# Predicted NO2 values, drawn semi-transparent to distinguish them
predicted_no2_colors = get_simple_color_scale(no2_future_values, 40)
fig_no2.add_trace(
    go.Bar(
        x=df["Date"][-3:],  # dates for predicted values
        y=no2_future_values,
        name="NO2 Predicted",
        marker=dict(color=predicted_no2_colors, opacity=0.5),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)
217
+
218
+ fig_no2.add_shape(
219
+ dict(
220
+ type="line",
221
+ x0=pd.Timestamp.today(),
222
+ x1=pd.Timestamp.today(),
223
+ y0=min(no2_values),
224
+ y1=max(no2_values),
225
+ line=dict(color="White", width=3, dash="dash"),
226
+ )
227
+ )
228
+
229
+ fig_no2.update_layout(
230
+ plot_bgcolor="rgba(0, 0, 0, 0)",
231
+ paper_bgcolor="rgba(0, 0, 0, 0)",
232
+ yaxis_title="NO<sub>2</sub> Concentration (µg/m³)",
233
+ font=dict(size=14),
234
+ hovermode="x",
235
+ xaxis=dict(
236
+ title="Date",
237
+ type="date",
238
+ tickmode="array",
239
+ tickvals=df["Date"],
240
+ tickformat="%d-%b",
241
+ tickangle=-45,
242
+ tickcolor="gray",
243
+ ),
244
+ showlegend=False # Disable legend
245
+ )
246
+
247
+ st.plotly_chart(fig_no2, key="fig_no2")
pages/admin.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from sklearn.metrics import mean_squared_error

from src.data_api_calls import get_combined_data
7
+
8
# SECURITY: credentials were hard-coded ("admin"/"password"). Read them
# from the environment instead; the old values remain as defaults so
# existing deployments keep working, but they MUST be overridden in
# production via ADMIN_USERNAME / ADMIN_PASSWORD.
USERNAME = os.getenv("ADMIN_USERNAME", "admin")
PASSWORD = os.getenv("ADMIN_PASSWORD", "password")
10
+
11
+ st.title("Admin Panel")
12
+
13
+ # Use session state to remember login state
14
+ if "login_success" not in st.session_state:
15
+ st.session_state.login_success = False
16
+
17
+ # Login Form
18
+ if not st.session_state.login_success:
19
+ with st.form("login_form"):
20
+ st.write("Please login to access the admin dashboard:")
21
+ username = st.text_input("Username")
22
+ password = st.text_input("Password", type="password")
23
+ login_button = st.form_submit_button("Login")
24
+
25
+ if login_button:
26
+ if username == USERNAME and password == PASSWORD:
27
+ st.session_state.login_success = True
28
+ st.success("Login successful!")
29
+ else:
30
+ st.error("Invalid username or password.")
31
+ else:
32
+ # Fetching the combined data
33
+ table_data = get_combined_data()
34
+
35
+ # Check for missing values
36
+ missing_values = table_data.isnull()
37
+
38
+ # Display the main data table
39
+ st.subheader("Data used for the prediction")
40
+
41
+ # Display message based on whether data is complete
42
+ if missing_values.values.any():
43
+ # Warning message if there are missing values
44
+ st.markdown(
45
+ "<h4 style='color: #E68B0A;'>Warning: Some data is missing!</h4>",
46
+ unsafe_allow_html=True,
47
+ )
48
+
49
+ # Identify columns with missing values
50
+ missing_columns = table_data.columns[missing_values.any()].tolist()
51
+
52
+ # Identify rows (dates) with missing values
53
+ missing_rows = table_data[missing_values.any(axis=1)]["Date"].tolist()
54
+
55
+ # Display additional information about missing columns and rows
56
+ if missing_columns:
57
+ st.markdown(f"**Columns with missing data:** {', '.join(missing_columns)}")
58
+ if missing_rows:
59
+ st.markdown(
60
+ f"**Rows with missing data (dates):** {', '.join(missing_rows)}"
61
+ )
62
+ else:
63
+ # Success message if no data is missing
64
+ st.markdown(
65
+ "<h4 style='color: #77C124;'>All data is complete!</h4>",
66
+ unsafe_allow_html=True,
67
+ )
68
+ st.dataframe(table_data)
69
+ # Actual data vs 1,2,3 days ahead predictions
70
+ actual_data = pd.read_csv("pollution_data.csv")
71
+ prediction_data = pd.read_csv("predictions_history.csv")
72
+
73
+ col1, col2 = st.columns(2)
74
+ with col1:
75
+ pollutant = st.radio("Select a pollutant", ("O3", "NO2"))
76
+ with col2:
77
+ days_ahead = st.radio("Select days ahead for prediction", (1, 2, 3))
78
+
79
+ predictions = prediction_data[prediction_data["pollutant"] == pollutant]
80
+ actual = actual_data[["date", pollutant]].rename(
81
+ columns={pollutant: "actual_value"}
82
+ )
83
+
84
+ predictions_filtered = predictions[
85
+ predictions["date_predicted"]
86
+ == (
87
+ pd.to_datetime(predictions["date"]) - pd.Timedelta(days=days_ahead)
88
+ ).dt.strftime("%Y-%m-%d")
89
+ ]
90
+
91
+ fig = go.Figure()
92
+
93
+ fig.add_trace(
94
+ go.Scatter(
95
+ x=actual["date"],
96
+ y=actual["actual_value"],
97
+ mode="lines+markers",
98
+ name="Ground Truth",
99
+ line=dict(color="green", width=3),
100
+ )
101
+ )
102
+
103
+ fig.add_trace(
104
+ go.Scatter(
105
+ x=predictions_filtered["date"],
106
+ y=predictions_filtered["prediction_value"],
107
+ mode="lines+markers",
108
+ name=f"Prediction {days_ahead} day(s) ahead",
109
+ line=dict(dash="dash", color="orange", width=3),
110
+ )
111
+ )
112
+
113
+ fig.update_layout(
114
+ title=f"{pollutant} Predictions vs Actual Values",
115
+ xaxis_title="Date",
116
+ yaxis_title=f"{pollutant} Concentration",
117
+ legend=dict(x=0, y=1),
118
+ yaxis=dict(range=[0, 60]),
119
+ template="plotly_white",
120
+ xaxis=dict(
121
+ title="Date",
122
+ type="date",
123
+ tickmode="array",
124
+ tickvals=predictions["date"],
125
+ tickformat="%d-%b",
126
+ tickangle=-45,
127
+ tickcolor="gray",
128
+ ),
129
+ )
130
+
131
+ st.plotly_chart(fig)
132
+
133
+ # Evaluation Function
134
+ def evaluate_predictions_all_days(actual, predictions):
135
+ rmse_values_all = {"O3": [], "NO2": []}
136
+ smape_values_all = {"O3": [], "NO2": []}
137
+
138
+ for pollutant in ["O3", "NO2"]:
139
+ predictions_pollutant = predictions[predictions["pollutant"] == pollutant]
140
+ actual_pollutant = actual_data[["date", pollutant]].rename(
141
+ columns={pollutant: "actual_value"}
142
+ )
143
+
144
+ # Calculate RMSE and SMAPE for each day (1st, 2nd, and 3rd)
145
+ for i in range(1, 4):
146
+ predictions_filtered = predictions_pollutant[
147
+ predictions_pollutant["date_predicted"]
148
+ == (
149
+ pd.to_datetime(predictions_pollutant["date"])
150
+ - pd.Timedelta(days=i)
151
+ ).dt.strftime("%Y-%m-%d")
152
+ ]
153
+ actual_filtered = actual_pollutant[
154
+ actual_pollutant["date"].isin(predictions_filtered["date"])
155
+ ]
156
+ merged = pd.merge(
157
+ actual_filtered,
158
+ predictions_filtered,
159
+ left_on="date",
160
+ right_on="date",
161
+ )
162
+
163
+ if not merged.empty:
164
+ actual_values = merged["actual_value"].values
165
+ prediction_values = merged["prediction_value"].values
166
+
167
+ rmse = np.sqrt(mean_squared_error(actual_values, prediction_values))
168
+ rmse_values_all[pollutant].append(rmse)
169
+ smape = (
170
+ 100
171
+ / len(actual_values)
172
+ * np.sum(
173
+ 2
174
+ * np.abs(prediction_values - actual_values)
175
+ / (np.abs(actual_values) + np.abs(prediction_values))
176
+ )
177
+ )
178
+ smape_values_all[pollutant].append(smape)
179
+
180
+ # Plot RMSE and SMAPE for both pollutants
181
+ fig_rmse = go.Figure()
182
+ for day in range(3):
183
+ fig_rmse.add_trace(
184
+ go.Bar(
185
+ x=["O3", "NO2"],
186
+ y=[rmse_values_all["O3"][day], rmse_values_all["NO2"][day]],
187
+ name=f"Day {day + 1}",
188
+ )
189
+ )
190
+ fig_rmse.update_layout(
191
+ title="RMSE for Predictions Over 3 Days",
192
+ yaxis_title="RMSE",
193
+ xaxis_title="Pollutant",
194
+ barmode="group",
195
+ )
196
+ st.plotly_chart(fig_rmse)
197
+
198
+ fig_smape = go.Figure()
199
+ for day in range(3):
200
+ fig_smape.add_trace(
201
+ go.Bar(
202
+ x=["O3", "NO2"],
203
+ y=[smape_values_all["O3"][day], smape_values_all["NO2"][day]],
204
+ name=f"Day {day + 1}",
205
+ )
206
+ )
207
+ fig_smape.update_layout(
208
+ title="SMAPE for Predictions Over 3 Days",
209
+ yaxis_title="SMAPE (%)",
210
+ xaxis_title="Pollutant",
211
+ barmode="group",
212
+ )
213
+ st.plotly_chart(fig_smape)
214
+
215
+ # Calculate total current SMAPE and RMSE
216
+ total_O3_smape = sum(smape_values_all["O3"]) / len(smape_values_all)
217
+ total_NO2_smape = sum(smape_values_all["NO2"]) / len(smape_values_all)
218
+ total_O3_rmse = sum(rmse_values_all["O3"]) / len(rmse_values_all)
219
+ total_NO2_rmse = sum(rmse_values_all["NO2"]) / len(rmse_values_all)
220
+
221
+ # Display metrics table
222
+ metrics_data = {
223
+ "Metric": [
224
+ "Current NO2 SMAPE (%)",
225
+ "Current NO2 RMSE (µg/m3)",
226
+ "Current O3 SMAPE (%)",
227
+ "Current O3 RMSE (µg/m3)",
228
+ ],
229
+ "Value": [total_NO2_smape, total_NO2_rmse, total_O3_smape, total_O3_rmse],
230
+ }
231
+ metrics_df = pd.DataFrame(metrics_data)
232
+ st.table(metrics_df)
233
+
234
+ evaluate_predictions_all_days(actual_data, prediction_data)
past_pollution_data.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,NO2,O3
2
+ 2023-10-18,10.8427027027027,39.81260000000001
3
+ 2023-10-19,17.97026666666666,31.779024390243908
4
+ 2023-10-20,17.233055555555563,18.7156
5
+ 2023-10-21,15.023599999999991,22.04
6
+ 2023-10-22,8.723378378378372,48.33439999999999
7
+ 2023-10-23,20.63426666666668,15.586000000000002
8
+ 2023-10-24,15.1156,24.62808510638297
9
+ 2023-10-25,22.88567567567568,27.117599999999992
10
+ 2023-10-26,21.53175675675676,13.3216
11
+ 2023-10-27,23.07226666666666,16.15416666666666
12
+ 2023-10-28,24.89121621621622,24.59040816326531
13
+ 2023-10-29,9.724428571428573,51.525200000000005
14
+ 2023-10-30,11.20205479452055,52.820600000000006
15
+ 2023-10-31,17.494666666666667,44.458541666666655
16
+ 2023-11-01,21.588095238095235,29.20631578947369
17
+ 2023-11-02,9.745714285714286,48.39760869565216
18
+ 2023-11-03,7.163243243243242,61.421599999999984
past_weather_data.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
2
+ 2023-10-17,8.5,84.8,0.0,22.3,1019.3,34.8,75.2
3
+ 2023-10-18,9.0,77.9,2.3,25.9,1006.0,23.8,71.2
4
+ 2023-10-19,14.5,94.0,11.4,22.3,990.8,21.2,39.8
5
+ 2023-10-20,11.9,97.4,20.4,25.9,981.0,10.4,7.0
6
+ 2023-10-21,13.1,88.0,3.5,22.3,989.4,27.7,39.9
7
+ 2023-10-22,12.1,87.3,3.9,25.9,1003.6,32.3,55.9
8
+ 2023-10-23,9.9,95.7,0.5,18.0,1011.1,5.9,43.8
9
+ 2023-10-24,11.6,92.3,6.5,22.3,1001.3,23.1,32.6
10
+ 2023-10-25,9.3,96.8,15.3,18.0,996.8,15.7,14.5
11
+ 2023-10-26,9.4,97.6,0.1,11.2,995.6,4.8,36.0
12
+ 2023-10-27,10.6,97.9,11.4,14.8,992.0,9.5,20.5
13
+ 2023-10-28,11.4,88.6,3.0,18.4,994.4,29.3,48.5
14
+ 2023-10-29,13.0,82.2,9.5,31.7,991.5,38.8,35.4
15
+ 2023-10-30,11.2,90.4,13.0,18.4,997.5,28.8,27.0
16
+ 2023-10-31,11.0,93.7,18.6,18.0,1000.7,17.9,29.8
17
+ 2023-11-01,12.4,88.5,4.9,25.9,997.8,32.6,31.5
18
+ 2023-11-02,11.0,80.0,8.7,46.4,976.4,33.6,21.5
19
+ 2023-11-03,9.6,83.3,7.9,32.4,981.6,31,40.1
pollution_data.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,NO2,O3
2
+ 2024-10-17,22.804605103280675,22.769159859976643
3
+ 2024-10-18,23.26858769887009,23.30733245729302
4
+ 2024-10-19,23.91006441223834,23.1717142857143
5
+ 2024-10-20,22.57323754789273,23.53784452296821
6
+ 2024-10-21,21.1457004830918,24.02069565217393
7
+ 2024-10-22,21.77657980456027,23.33588571428572
8
+ 2024-10-23,21.974793814433,22.21468879668051
9
+ 2024-10-24,25.51256756756757,20.91370967741937
10
+ 2024-10-25,21.72051282051282,22.33230769230769
11
+ 2024-10-26,24.46423484380123,18.70331123489324
12
+ 2024-10-27,27.53722134983982,20.80809239842384
13
+ 2024-10-28,23.337567567567568,26.82861788617886
14
+ 2024-10-29,16.53533209586906,23.28254887605004
15
+ 2024-10-30,22.26162162162162,18.03443548387097
16
+ 2024-10-31,24.919333333333334,20.79696
predictions_history.csv ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pollutant,date_predicted,date,prediction_value
2
+ O3,2024-10-14,2024-10-17,31.25335185244893
3
+ NO2,2024-10-14,2024-10-17,26.421736787446267
4
+ O3,2024-10-15,2024-10-17,22.00005767760448
5
+ NO2,2024-10-15,2024-10-17,28.59511317503212
6
+ O3,2024-10-16,2024-10-17,9.657466070999735
7
+ NO2,2024-10-16,2024-10-17,17.065168790519902
8
+ O3,2024-10-15,2024-10-18,6.561248
9
+ NO2,2024-10-15,2024-10-18,26.443672
10
+ O3,2024-10-16,2024-10-18,19.782418
11
+ NO2,2024-10-16,2024-10-18,36.453956
12
+ O3,2024-10-17,2024-10-18,16.08841798553393
13
+ NO2,2024-10-17,2024-10-18,32.0458143607889
14
+ O3,2024-10-16,2024-10-19,24.031357603260783
15
+ NO2,2024-10-16,2024-10-19,20.08389395558791
16
+ O3,2024-10-17,2024-10-19,21.031357603260783
17
+ NO2,2024-10-17,2024-10-19,27.08389395558791
18
+ O3,2024-10-17,2024-10-20,20.48486247979324
19
+ NO2,2024-10-17,2024-10-20,23.84300578029378
20
+ O3,2024-10-18,2024-10-19,22.304547122637445
21
+ NO2,2024-10-18,2024-10-19,20.80017116560889
22
+ O3,2024-10-18,2024-10-20,31.25335185244893
23
+ NO2,2024-10-18,2024-10-20,29.732316066240585
24
+ O3,2024-10-18,2024-10-21,28.67755196805434
25
+ NO2,2024-10-18,2024-10-21,35.04638743773354
26
+ O3,2024-10-19,2024-10-20,26.421736787446267
27
+ NO2,2024-10-19,2024-10-20,27.399885723190767
28
+ O3,2024-10-19,2024-10-21,17.065168790519902
29
+ NO2,2024-10-19,2024-10-21,18.992352714813563
30
+ O3,2024-10-19,2024-10-22,17.39682962048955
31
+ NO2,2024-10-19,2024-10-22,22.85061675885908
32
+ O3,2024-10-20,2024-10-21,22.00005767760448
33
+ NO2,2024-10-20,2024-10-21,18.27191592927812
34
+ O3,2024-10-20,2024-10-22,29.00940466937953
35
+ NO2,2024-10-20,2024-10-22,19.50739766963497
36
+ O3,2024-10-20,2024-10-23,20.062134354543343
37
+ NO2,2024-10-20,2024-10-23,23.65746607099973
38
+ O3,2024-10-21,2024-10-22,17.497382318189132
39
+ NO2,2024-10-21,2024-10-22,28.59511317503212
40
+ O3,2024-10-21,2024-10-23,16.519952190354232
41
+ NO2,2024-10-21,2024-10-23,30.192389708351826
42
+ O3,2024-10-21,2024-10-24,28.19940385112904
43
+ NO2,2024-10-21,2024-10-24,17.9525039623211
44
+ O3,2024-10-22,2024-10-23,16.093074246425157
45
+ NO2,2024-10-22,2024-10-23,25.217639978187005
46
+ O3,2024-10-22,2024-10-24,23.605545201596552
47
+ NO2,2024-10-22,2024-10-24,29.004701753536988
48
+ O3,2024-10-23,2024-10-24,26.56486295059828
49
+ NO2,2024-10-23,2024-10-24,20.15373733747257
50
+ O3,2024-10-24,2024-10-25,10.33808859423279
51
+ NO2,2024-10-24,2024-10-25,25.68519991558237
52
+ O3,2024-10-24,2024-10-26,16.000984317626852
53
+ NO2,2024-10-24,2024-10-26,25.760307451092384
54
+ O3,2024-10-24,2024-10-27,19.64377495640328
55
+ NO2,2024-10-24,2024-10-27,31.210576791105115
56
+ O3,2024-10-25,2024-10-26,20.48055947200643
57
+ NO2,2024-10-25,2024-10-26,23.95723903986424
58
+ O3,2024-10-25,2024-10-27,11.088152958498888
59
+ NO2,2024-10-25,2024-10-27,32.274494671100506
60
+ O3,2024-10-25,2024-10-28,-0.7175631399505704
61
+ NO2,2024-10-25,2024-10-28,40.86107800019054
62
+ O3,2024-10-28,2024-10-29,22.13652238154496
63
+ NO2,2024-10-28,2024-10-29,31.608886931951144
64
+ O3,2024-10-28,2024-10-30,15.841669224
65
+ NO2,2024-10-28,2024-10-30,34.564284711452984
66
+ O3,2024-10-28,2024-10-31,22.35944571003375
67
+ NO2,2024-10-28,2024-10-31,34.37482132111927
68
+ O3,2024-10-30,2024-10-31,15.98046542733637
69
+ NO2,2024-10-30,2024-10-31,29.77507241979599
70
+ O3,2024-10-30,2024-11-01,21.135906183680472
71
+ NO2,2024-10-30,2024-11-01,28.38872595850704
72
+ O3,2024-10-30,2024-11-02,19.67426015042635
73
+ O3,2024-10-31,2024-11-01,16.491393851863755
74
+ NO2,2024-10-31,2024-11-01,17.22825222459993
75
+ O3,2024-10-31,2024-11-02,16.874728806873033
76
+ NO2,2024-10-31,2024-11-02,14.771381333796965
77
+ O3,2024-10-31,2024-11-03,15.244292496093546
78
+ NO2,2024-10-31,2024-11-03,14.606430068166452
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ joblib # or pickle if you're using that to load the model
5
+ scikit-learn # for mock model
6
+ altair
7
+ matplotlib
8
+ plotly
9
+ # NOTE: "http.client" and "datetime" removed — both are Python standard-library modules, not installable PyPI distributions.
11
+ huggingface-hub
12
+ python-dotenv
13
+ torch
14
+ safetensors
src/data_api_calls.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import csv
3
+ import http.client
4
+ import os
5
+ import re
6
+ import sys
7
+ import urllib.request
8
+ from datetime import date, timedelta
9
+ from io import StringIO
10
+
11
+ import pandas as pd
12
+
13
+ WEATHER_DATA_FILE = "weather_data.csv"
14
+ POLLUTION_DATA_FILE = "pollution_data.csv"
15
+
16
+
17
def update_weather_data():
    """Fetch daily weather for Utrecht and append it to WEATHER_DATA_FILE.

    Pulls every day from the day after the last stored date (or the last
    7 days when no file exists) up to today from the Visual Crossing
    timeline API, then de-duplicates on "date" keeping the newest row.
    Exits the process on API failure (mirrors original behaviour).
    """
    today = date.today().isoformat()

    if os.path.exists(WEATHER_DATA_FILE):
        df = pd.read_csv(WEATHER_DATA_FILE)
        last_date = pd.to_datetime(df["date"]).max()
        start_date = (last_date + timedelta(1)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (date.today() - timedelta(7)).isoformat()

    # SECURITY: prefer the key from the environment; the inline fallback
    # keeps existing deployments working but should be rotated/removed.
    api_key = os.getenv("VISUAL_CROSSING_API_KEY", "7Y6AY56M6RWVNHQ3SAVHNJWFS")
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"Utrecht/{start_date}/{today}"
        "?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity"
        f"&include=days&key={api_key}&maxStations=1&contentType=csv"
    )

    try:
        # Context manager closes the HTTP response even on parse errors.
        with urllib.request.urlopen(url) as result_bytes:
            rows = list(csv.reader(codecs.iterdecode(result_bytes, "utf-8")))

        # First CSV row is the header; "datetime" becomes our "date" key.
        new_data = pd.DataFrame(rows[1:], columns=rows[0])
        new_data = new_data.rename(columns={"datetime": "date"})

        updated_df = pd.concat([df, new_data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        # HTTPError carries a response body with the API's error message.
        print("Error code: ", e.code, e.read().decode())
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read()/.code attributes — the old
        # handler itself raised AttributeError; it only exposes .reason.
        print("Connection error: ", e.reason)
        sys.exit()
51
+
52
+
53
def update_pollution_data():
    """Fetch today's NO2/O3 measurements and append them to POLLUTION_DATA_FILE.

    Queries the Luchtmeetnet open API for three Utrecht stations, averages
    the numeric readings per particle over the last 24 hours, and appends
    one row for today (de-duplicated on "date"). No-op when the stored
    file already contains a row for today.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    today = date.today().isoformat() + "T09:00:00Z"
    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"

    # BUG FIX: the old code referenced `existing_data` at the end even when
    # the file did not exist, raising NameError on first run.
    if os.path.exists(POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.Timestamp(date.today()):
            print("Data is already up to date.")
            return
    else:
        existing_data = pd.DataFrame()

    for particle in particles:
        # BUG FIX: this list was previously created once outside both loops,
        # so the second particle's average also included the first
        # particle's frames, contaminating the O3 value with NO2 readings.
        all_dataframes = []
        for station in stations:
            conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
            try:
                conn.request(
                    "GET",
                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                    "",
                    {},
                )
                decoded_data = conn.getresponse().read().decode("utf-8")
            finally:
                conn.close()  # don't leak one socket per station/particle
            df = pd.read_csv(StringIO(decoded_data))
            # Keep only the columns whose labels contain "value".
            all_dataframes.append(df.filter(like="value"))

        combined_data = pd.concat(all_dataframes, ignore_index=True)

        # Iterating a DataFrame yields its column labels; the numeric
        # readings end up embedded in those labels after parsing the API
        # response with read_csv — presumably the payload is JSON-ish.
        # NOTE(review): fragile; consider parsing the JSON body directly.
        values = []
        for row in combined_data:
            cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
            if cleaned_value:
                values.append(float(cleaned_value[0]))

        if values:
            avg = sum(values) / len(values)
            if particle == "NO2":
                NO2.append(avg)
            else:
                O3.append(avg)

    new_data = pd.DataFrame(
        {
            "date": [date.today()],
            "NO2": NO2,
            "O3": O3,
        }
    )

    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
114
+
115
+
116
def get_combined_data():
    """Build the last-7-days weather+pollution table used by the dashboard.

    Reads the stored weather and pollution CSVs, restricts both to the
    last seven days, renames/reorders columns to the model's schema and
    rescales units into KNMI-style tenths. Returns one row per day.
    """
    weather_df = pd.read_csv(WEATHER_DATA_FILE)

    today = pd.Timestamp.now().normalize()
    seven_days_ago = today - pd.Timedelta(days=7)
    weather_df["date"] = pd.to_datetime(weather_df["date"])
    weather_df = weather_df[
        (weather_df["date"] >= seven_days_ago) & (weather_df["date"] <= today)
    ]

    # Insert placeholder pollution/weekday columns, then shuffle columns
    # into the exact order the downstream pipeline expects.
    weather_df.insert(1, "NO2", None)
    weather_df.insert(2, "O3", None)
    weather_df.insert(10, "weekday", None)
    columns = list(weather_df.columns)
    columns.insert(3, columns.pop(6))
    weather_df = weather_df[columns]
    columns.insert(5, columns.pop(9))
    weather_df = weather_df[columns]
    columns.insert(9, columns.pop(6))
    weather_df = weather_df[columns]

    combined_df = weather_df

    # Rename to the model's feature names ("percipitation" [sic] is the
    # spelling used consistently throughout the project).
    combined_df = combined_df.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    combined_df["date"] = pd.to_datetime(combined_df["date"])
    combined_df["weekday"] = combined_df["date"].dt.day_name()

    # Convert to tenths-based units (0.1 m/s, 0.1 °C, 0.1 mm, 0.1 hPa, ...)
    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
    combined_df["percipitation"] = combined_df["percipitation"] * 10
    combined_df["pressure"] = combined_df["pressure"] * 10

    for col in (
        "wind_speed",
        "mean_temp",
        "minimum_visibility",
        "percipitation",
        "pressure",
        "humidity",
        "global_radiation",
    ):
        combined_df[col] = combined_df[col].astype(int)

    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)

    pollution_df["date"] = pd.to_datetime(pollution_df["date"])
    pollution_df = pollution_df[
        (pollution_df["date"] >= seven_days_ago) & (pollution_df["date"] <= today)
    ]

    # BUG FIX: plain column assignment aligns on the integer index, which
    # differs between the two CSVs after filtering (each keeps its original
    # row labels), silently producing NaN/misaligned pollution values.
    # Align explicitly on the date instead.
    pollution_by_date = pollution_df.set_index("date")
    combined_df["NO2"] = combined_df["date"].map(pollution_by_date["NO2"])
    combined_df["O3"] = combined_df["date"].map(pollution_by_date["O3"])

    return combined_df
src/features_pipeline.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import warnings
3
+
4
+ import joblib
5
+ import numpy as np
6
+ import pandas as pd
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import hf_hub_download, login
9
+
10
+ from src.past_data_api_calls import get_past_combined_data
11
+
12
# Module import side effects: silence warnings, load .env, and authenticate
# with the Hugging Face Hub so scaler downloads in create_features() work.
# NOTE(review): blanket warning suppression hides real issues — consider scoping it.
warnings.filterwarnings("ignore")

load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
16
+
17
+
18
def create_features(
    data,
    target_particle,  # Added this parameter
    lag_days=7,
    sma_days=7,
):
    """Build the scaled feature matrix used by the forecasting models.

    Adds cyclical weekday/month encodings, per-feature lags (1..lag_days),
    a simple moving average over ``sma_days``, and last-year pollutant
    values from ``get_past_combined_data()``, then applies the feature
    scaler downloaded from the Hugging Face Hub.

    Args:
        data: DataFrame with a ``date`` column plus the weather/pollution
            columns listed in ``lag_features`` below.
        target_particle: "O3" or "NO2"; for NO2 two extra columns
            (percipitation, pressure) are lagged as well.
        lag_days: number of daily lags generated per feature.
        sma_days: window length for the moving-average features.

    Returns:
        DataFrame of scaled features (rows made incomplete by lagging are
        dropped).

    Raises:
        ValueError: if ``target_particle`` is not "O3" or "NO2".
    """
    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        lag_features = lag_features + ["percipitation", "pressure"]

    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)

    # Extract 'weekday' and 'month' from 'date' if not present.  A string
    # weekday column (day names) is replaced with numeric codes so the
    # cyclical encoding below works.
    if "weekday" not in data.columns or data["weekday"].dtype == object:
        data["weekday"] = data["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in data.columns:
        data["month"] = data["date"].dt.month  # 1 to 12

    # Create sine and cosine transformations for 'weekday' and 'month'
    # so the models see their cyclical nature (Sunday is adjacent to Monday).
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Create lagged features for the specified lag days
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Create SMA (simple moving average) features
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )

    # Create particle data (NO2 and O3) from the same time last year.
    # NOTE(review): get_past_combined_data() returns 11 rows covering
    # 7 days before through 3 days after "today minus one year", so
    # iloc[-4] is assumed to be today-last-year — confirm alignment.
    past_data = get_past_combined_data()

    # Today last year
    data["O3_last_year"] = past_data["O3"].iloc[-4]
    data["NO2_last_year"] = past_data["NO2"].iloc[-4]

    # 7 days before today last year
    # NOTE(review): iloc[i - 1] maps i=1 to the OLDEST row (7 days before),
    # so the "_{i}_days_before" labels may be reversed relative to their
    # names — confirm the training pipeline used the same convention.
    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]

    # 3 days after today last year
    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]

    # Drop missing values (the first lag_days rows are incomplete by design)
    rows_before = data.shape[0]
    data = data.dropna().reset_index(drop=True)
    rows_after = data.shape[0]
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}/{rows_before}")
    print(data)

    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

    # Define feature columns; raw date/weekday/month are excluded but their
    # sin/cos encodings are kept.  NOTE(review): the raw NO2/O3 columns stay
    # in the features — presumably the scaler was fitted that way; verify.
    exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Split features and targets
    x = data[feature_cols]

    # Scale with the scaler fitted at training time (downloaded from HF Hub)
    repo_id = f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model"
    file_name = f"feature_scaler_{target_particle}.joblib"
    path = hf_hub_download(repo_id=repo_id, filename=file_name)
    feature_scaler = joblib.load(path)
    X_scaled = feature_scaler.transform(x)

    # Convert scaled data back to DataFrame for consistency
    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)

    return X_scaled
src/helper_functions.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
+ # Custom function to create styled metric boxes with compact layout
5
def custom_metric_box(label, value):
    """Render a compact, left-aligned metric card: small label over a bold value."""
    # Build the HTML once, then hand it to Streamlit with raw-HTML rendering on.
    card_html = f"""
    <div style="
        padding: 5px;
        margin-bottom: 5px;
        width: 100%; /* Full width */
        display: flex;
        flex-direction: column; /* Align items vertically */
        align-items: flex-start; /* Align all content to the left */
    ">
        <div>
            <h4 style="font-size: 14px; font-weight: normal; margin: 0;">{label}</h4> <!-- Smaller label -->
        </div>
        <div>
            <p style="font-size: 18px; font-weight: bold; margin: 0;">{value}</p> <!-- Smaller metric -->
        </div>
    </div>
    """
    st.markdown(card_html, unsafe_allow_html=True)
23
+
24
# Pollution metric card: pollutant name, a color-coded Good/Bad verdict, and
# the measured value. ``delta`` is accepted for call compatibility but unused.
def pollution_box(label, value, delta, threshold):
    """Render a glassmorphism-styled card showing pollutant status vs. threshold."""
    # "Good" strictly below the threshold, otherwise "Bad"; pick color to match.
    below_limit = float(value.split()[0]) < threshold
    status = "Good" if below_limit else "Bad"
    status_color = "#77C124" if below_limit else "#E68B0A"

    card_html = f"""
    <div style="
        background: rgba(255, 255, 255, 0.05);
        border-radius: 16px;
        box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
        backdrop-filter: blur(5px);
        -webkit-backdrop-filter: blur(5px);
        border: 1px solid rgba(255, 255, 255, 0.15);
        padding: 15px;
        margin-bottom: 10px;
    ">
        <h4 style="font-size: 24px; font-weight: bold; margin: 0;">{label}</h4> <!-- Bigger label -->
        <p style="font-size: 36px; font-weight: bold; color: {status_color}; margin: 0;">{status}</p> <!-- Good/Bad with color -->
        <p style="font-size: 18px; margin: 0;">{value}</p> <!-- Smaller value where delta used to be -->
    </div>
    """
    st.markdown(card_html, unsafe_allow_html=True)
src/past_data_api_calls.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import csv
3
+ import http.client
4
+ import os
5
+ import re
6
+ import sys
7
+ import urllib.request
8
+ from datetime import date, timedelta
9
+ from io import StringIO
10
+
11
+ import pandas as pd
12
+
13
# Local CSV caches of last year's weather and pollution observations,
# incrementally updated by the functions below.
PAST_WEATHER_DATA_FILE = "past_weather_data.csv"
PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
15
+
16
+
17
def update_past_weather_data():
    """Fetch last-year weather for Utrecht from Visual Crossing and append it
    to PAST_WEATHER_DATA_FILE, de-duplicating by date.

    On the first run, fetches a window from 8 days before to 2 days after
    "today minus one year"; later runs resume from the latest cached date.
    Exits the process on HTTP/URL errors (preserved original behavior).
    """
    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_WEATHER_DATA_FILE):
        df = pd.read_csv(PAST_WEATHER_DATA_FILE)
        # Resume from the most recent date already cached.
        start_date = pd.to_datetime(df["date"]).max().date().isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (last_year_date - timedelta(days=8)).isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()

    # SECURITY(review): the API key is hard-coded in the URL below — it is
    # leaked in version control and should be rotated and read from an
    # environment variable instead.
    try:
        ResultBytes = urllib.request.urlopen(
            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
        )
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))

        data = pd.DataFrame(list(CSVText))
        data.columns = data.iloc[0]  # first CSV row is the header
        data = data[1:]
        data = data.rename(columns={"datetime": "date"})

        # Merge with the cache, keeping the freshest row per date.
        updated_df = pd.concat([df, data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(PAST_WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print("Error code: ", e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read() or .code attribute; the old
        # handler raised AttributeError here and masked the real failure.
        print("URL error: ", e.reason)
        sys.exit()
52
+
53
+
54
def update_past_pollution_data():
    """Incrementally update PAST_POLLUTION_DATA_FILE with daily NO2/O3
    averages (across three Utrecht stations) for the last-year window,
    fetched from the Luchtmeetnet open API.

    Returns:
        (NO2, O3): the lists of newly computed daily averages, or None if
        the cache was already up to date.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    all_dataframes = []

    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.to_datetime(last_year_date):
            print("Data is already up to date.")
            return
        else:
            # Resume from the last cached date up to 3 days past last year.
            start_date = last_date.date()
            end_date = last_year_date + timedelta(days=3)
    else:
        existing_data = pd.DataFrame()
        start_date = last_year_date - timedelta(days=7)
        end_date = last_year_date + timedelta(days=3)

    # Inclusive list of every day in [start_date, end_date].
    date_list = [
        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
    ]
    for current_date in date_list:
        # 24h window ending at 09:00 UTC of the current day.
        today = current_date.isoformat() + "T09:00:00Z"
        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
        for particle in particles:
            all_dataframes = []  # Reset for each particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                payload = ""
                headers = {}
                conn.request(
                    "GET",
                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                    payload,
                    headers,
                )
                res = conn.getresponse()
                data = res.read()
                # NOTE(review): the API presumably returns JSON, which is
                # parsed here as CSV; the numeric values end up embedded in
                # column labels rather than cells — confirm this is intended.
                decoded_data = data.decode("utf-8")
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like="value")
                all_dataframes.append(df)

            combined_data = pd.concat(all_dataframes, ignore_index=True)
            values = []
            # NOTE(review): iterating a DataFrame yields its COLUMN LABELS,
            # not rows; the regex extracts the first number from each label.
            for row in combined_data:
                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
                if cleaned_value:
                    values.append(float(cleaned_value[0]))

            if values:
                avg = sum(values) / len(values)
                if particle == "NO2":
                    NO2.append(avg)
                else:
                    O3.append(avg)
            # NOTE(review): when no values are found for a day, nothing is
            # appended, so NO2/O3 can end up shorter than date_list and the
            # DataFrame constructor below would raise — verify upstream.

    new_data = pd.DataFrame(
        {
            "date": date_list,
            "NO2": NO2,
            "O3": O3,
        }
    )

    # Merge with the cache, keeping the freshest row per date.
    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(PAST_POLLUTION_DATA_FILE, index=False)

    return NO2, O3
130
+
131
+
132
def get_past_combined_data():
    """Return the last-year window (11 rows) of combined weather + pollution
    data, renamed and scaled to the training-data conventions (most
    quantities stored as integers in tenths of their original units)."""
    # Make sure both local CSV caches are current before reading them.
    update_past_weather_data()
    update_past_pollution_data()

    weather = pd.read_csv(PAST_WEATHER_DATA_FILE)
    pollution = pd.read_csv(PAST_POLLUTION_DATA_FILE)

    # Keep only dates present in both sources; the final 11 rows cover
    # 7 days before through 3 days after "today minus one year".
    merged = weather.merge(pollution, on="date", how="inner").tail(11)

    merged = merged.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    merged["date"] = pd.to_datetime(merged["date"])
    merged["weekday"] = merged["date"].dt.day_name()

    # Coerce the numeric columns; pressure and humidity are pre-rounded
    # before any scaling, matching the original pipeline exactly.
    for col in ("wind_speed", "mean_temp", "minimum_visibility",
                "percipitation", "global_radiation"):
        merged[col] = merged[col].astype(float)
    for col in ("pressure", "humidity"):
        merged[col] = merged[col].astype(float).round()

    # Unit scaling: wind speed km/h -> m/s, then everything to tenths.
    merged["wind_speed"] = (merged["wind_speed"] / 3.6) * 10
    for col in ("mean_temp", "minimum_visibility", "percipitation", "pressure"):
        merged[col] = merged[col] * 10

    # Final integer representation for every model feature column.
    for col in (
        "wind_speed",
        "mean_temp",
        "minimum_visibility",
        "percipitation",
        "pressure",
        "humidity",
        "global_radiation",
    ):
        merged[col] = merged[col].astype(float).round().astype(int)

    return merged
src/predict.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import date, datetime, timedelta
3
+
4
+ import joblib
5
+ import pandas as pd
6
+ import torch
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import hf_hub_download, login
9
+
10
+ from src.data_api_calls import (
11
+ get_combined_data,
12
+ update_pollution_data,
13
+ update_weather_data,
14
+ )
15
+ from src.features_pipeline import create_features
16
+
17
+ load_dotenv()
18
+ login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
19
+
20
+
21
def load_nn():
    """Download and return the NO2 feed-forward network from the HF Hub.

    The architecture is defined inline; its submodule names (`layers_list`,
    `output`) must match the uploaded checkpoint's state_dict exactly, so
    do not rename them.
    """
    # Function-scoped imports keep these heavyweight deps out of module import.
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
        """MLP: [Linear -> ReLU -> Dropout] per hidden layer; 3 outputs (next 3 days)."""

        def __init__(self, input_size, layers, dropout_rate):
            super(AirPollutionNet, self).__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            # One Linear/ReLU/Dropout triple per requested hidden width.
            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x):
            # Sequentially apply the hidden stack, then the output head.
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    # Weights and config (input_size, layers, dropout_rate) come from the Hub.
    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model
49
+
50
+
51
def load_model(particle):
    """Load the forecasting model for ``particle``.

    "O3" uses a scikit-learn SVR pickled on the HF Hub; anything else
    (i.e. "NO2") uses the PyTorch network from load_nn().
    """
    if particle != "O3":
        return load_nn()

    # O3: download the pickled SVR once and unpickle it.
    model_path = hf_hub_download(
        repo_id=f"elisaklunder/Utrecht-{particle}-Forecasting-Model",
        filename="O3_svr_model.pkl",
    )
    return joblib.load(model_path)
61
+
62
+
63
def run_model(particle, data):
    """Produce the 3-day forecast for ``particle`` from a week of raw data.

    Builds scaled features, runs the matching model, then inverse-transforms
    the output with the target scaler fetched from the HF Hub.
    """
    features = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        # Torch network: run without autograd bookkeeping.
        with torch.no_grad():
            prediction = model(torch.tensor(features.values, dtype=torch.float32))
        scaler_repo = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        scaler_file = "target_scaler_NO2.joblib"
    else:
        # scikit-learn SVR path.
        prediction = model.predict(features)
        scaler_repo = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        scaler_file = f"target_scaler_{particle}.joblib"

    # Undo the target scaling applied at training time.
    scaler_path = hf_hub_download(repo_id=scaler_repo, filename=scaler_file)
    target_scaler = joblib.load(scaler_path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction
84
+
85
+
86
def update_data_and_predictions():
    """Refresh source data, run both models, and append today's 3-day O3/NO2
    forecasts to predictions_history.csv.

    Rows predicted earlier today are removed first, so reruns on the same
    day replace rather than duplicate predictions.
    """
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    # One row per pollutant per horizon (tomorrow .. 3 days ahead).
    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Filter out predictions made today to avoid duplicates
        existing_data = existing_data[
            ~(existing_data["date_predicted"] == str(date.today()))
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        # BUG FIX: drop_duplicates() returns a new DataFrame; the result was
        # previously discarded, so duplicates were never actually removed.
        combined_data = combined_data.drop_duplicates()
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)
130
+
131
+
132
def get_data_and_predictions():
    """Return this week's combined data plus the O3 and NO2 predictions
    that were stored today in predictions_history.csv."""
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    history = pd.read_csv(PREDICTIONS_FILE)

    # Only the rows written by today's prediction run.
    today_str = datetime.today().strftime("%Y-%m-%d")
    made_today = history[history["date_predicted"] == today_str]

    def _values_for(pollutant):
        # Prediction values for one pollutant, in stored order.
        mask = made_today["pollutant"] == pollutant
        return made_today.loc[mask, "prediction_value"].values

    return week_data, [_values_for("O3")], [_values_for("NO2")]
150
+
151
+ if __name__=="__main__":
152
+ update_data_and_predictions()
weather_data.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
2
+ 2024-10-17,16.9,86.0,0.6,18.4,1010.0,37.1,43.0
3
+ 2024-10-18,15.5,97.3,3.9,7.6,1014.0,4.5,42.9
4
+ 2024-10-19,14.7,89.9,1.6,14.8,1014.1,22.8,43.5
5
+ 2024-10-20,15.5,83.8,0.5,29.5,1016.0,41.5,0.0
6
+ 2024-10-21,14.4,92.7,4.3,21.2,1020.6,22.0,27.8
7
+ 2024-10-22,11.4,92.8,4.9,19.4,1026.9,22.6,57.0
8
+ 2024-10-23,11.2,97.3,0.0,13.0,1032.8,6.5,12.5
9
+ 2024-10-24,10.4,94.0,0.0,20.5,1024.7,13.0,62.5
10
+ 2024-10-25,13.6,92.2,0.5,11.9,1016.8,24.0,93.0
11
+ 2024-10-26,13.7,91.5,0.0,11.9,1016.3,23.3,8.0
12
+ 2024-10-27,13.2,87.1,0.1,20.5,1019.4,10.4,28.6
13
+ 2024-10-28,12.4,91.8,1.1,31.7,1021.8,12.8,27.3
14
+ 2024-10-29,13.8,95.9,0.2,20.5,1023.1,8.1,16.0
15
+ 2024-10-30,12.7,92.9,0.6,9.4,1027.5,12.5,32.8
16
+ 2024-10-31,12.5,89.9,0.0,11.2,1027.1,17.1,70.6