Commit
·
359c749
0
Parent(s):
Update .gitlab-ci.yml file
Browse files- .gitattributes +35 -0
- .gitignore +6 -0
- README.md +13 -0
- __pycache__/data_api_calls.cpython-312.pyc +0 -0
- __pycache__/data_loading.cpython-312.pyc +0 -0
- __pycache__/helper_functions.cpython-312.pyc +0 -0
- app.py +247 -0
- pages/admin.py +234 -0
- past_pollution_data.csv +18 -0
- past_weather_data.csv +19 -0
- pollution_data.csv +16 -0
- predictions_history.csv +78 -0
- requirements.txt +14 -0
- src/data_api_calls.py +176 -0
- src/features_pipeline.py +110 -0
- src/helper_functions.py +47 -0
- src/past_data_api_calls.py +190 -0
- src/predict.py +152 -0
- weather_data.csv +16 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
.env
|
3 |
+
__pycache__/
|
4 |
+
*.pyc
|
5 |
+
*.joblib
|
6 |
+
scalers/
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Utrecht Pollution Prediction
|
3 |
+
emoji: 🦀
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: purple
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.39.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
short_description: 'Demo: Model to predict O3 and NO2 concentrations in Utrecht'
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
__pycache__/data_api_calls.cpython-312.pyc
ADDED
Binary file (10.3 kB). View file
|
|
__pycache__/data_loading.cpython-312.pyc
ADDED
Binary file (7.96 kB). View file
|
|
__pycache__/helper_functions.cpython-312.pyc
ADDED
Binary file (2 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import altair as alt
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
import streamlit as st
|
5 |
+
|
6 |
+
from src.helper_functions import custom_metric_box, pollution_box
|
7 |
+
from src.predict import get_data_and_predictions, update_data_and_predictions
|
8 |
+
|
9 |
+
st.set_page_config(
|
10 |
+
page_title="Utrecht Pollution Dashboard ",
|
11 |
+
page_icon="🌱",
|
12 |
+
layout="wide",
|
13 |
+
initial_sidebar_state="expanded",
|
14 |
+
)
|
15 |
+
|
16 |
+
alt.themes.enable("dark")
|
17 |
+
|
18 |
+
update_data_and_predictions()
|
19 |
+
|
20 |
+
week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
|
21 |
+
|
22 |
+
today = week_data.iloc[-1]
|
23 |
+
previous_day = week_data.iloc[-2]
|
24 |
+
|
25 |
+
dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
|
26 |
+
dates_future = pd.date_range(
|
27 |
+
start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3
|
28 |
+
).to_list()
|
29 |
+
|
30 |
+
# O3 and NO2 values for the past 7 days
|
31 |
+
o3_past_values = week_data["O3"]
|
32 |
+
no2_past_values = week_data["NO2"]
|
33 |
+
o3_future_values = pd.Series(predictions_O3[0].flatten())
|
34 |
+
no2_future_values = pd.Series(predictions_NO2[0].flatten())
|
35 |
+
o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
|
36 |
+
no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
|
37 |
+
|
38 |
+
dates = dates_past + dates_future
|
39 |
+
df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
|
40 |
+
|
41 |
+
# App Title
|
42 |
+
st.title("Utrecht Pollution Dashboard 🌱")
|
43 |
+
|
44 |
+
col1, col2 = st.columns((1, 3))
|
45 |
+
# Create a 3-column layout
|
46 |
+
with col1:
|
47 |
+
st.subheader("Current Weather")
|
48 |
+
|
49 |
+
|
50 |
+
custom_metric_box(
|
51 |
+
label="🥵 Temperature",
|
52 |
+
value=f"{round(today['mean_temp'] * 0.1)} °C",
|
53 |
+
)
|
54 |
+
custom_metric_box(
|
55 |
+
label="💧 Humidity",
|
56 |
+
value=f"{round(today['humidity'])} %",
|
57 |
+
)
|
58 |
+
custom_metric_box(
|
59 |
+
label="🪨 Pressure",
|
60 |
+
value=f"{round(today['pressure'] * 0.1)} hPa",
|
61 |
+
)
|
62 |
+
|
63 |
+
custom_metric_box(
|
64 |
+
label="🌧️ Precipitation",
|
65 |
+
value=f"{round(today['percipitation'] * 0.1)} mm",
|
66 |
+
)
|
67 |
+
custom_metric_box(
|
68 |
+
label="🌤️ Solar Radiation",
|
69 |
+
value=f"{round(today['global_radiation'])} J/m²",
|
70 |
+
)
|
71 |
+
custom_metric_box(
|
72 |
+
label="🌪️ Wind Speed",
|
73 |
+
value=f"{round(today['wind_speed'] * 0.1, 1)} m/s",
|
74 |
+
)
|
75 |
+
|
76 |
+
with col2:
|
77 |
+
st.subheader("Current Pollution Levels")
|
78 |
+
sub1, sub2 = st.columns((1, 1))
|
79 |
+
|
80 |
+
# Ozone (O₃) Pollution Box
|
81 |
+
with sub1:
|
82 |
+
pollution_box(
|
83 |
+
label="O<sub>3</sub>",
|
84 |
+
value=f"{round(today['O3'])} µg/m³",
|
85 |
+
delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
|
86 |
+
threshold=120
|
87 |
+
)
|
88 |
+
with st.expander("Learn more about O3", expanded=False):
|
89 |
+
st.markdown(
|
90 |
+
"""
|
91 |
+
*Ozone (O<sub>3</sub>)*: A harmful gas at ground level that can irritate the respiratory system and aggravate asthma.<br>
|
92 |
+
**Good/Bad**: "Good" means safe levels for most people, while "Bad" suggests harmful levels, especially for sensitive groups.
|
93 |
+
""",
|
94 |
+
unsafe_allow_html=True,
|
95 |
+
)
|
96 |
+
|
97 |
+
# Nitrogen Dioxide (NO₂) Pollution Box
|
98 |
+
with sub2:
|
99 |
+
pollution_box(
|
100 |
+
label="NO<sub>2</sub>",
|
101 |
+
value=f"{round(today['NO2'])} µg/m³",
|
102 |
+
delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
|
103 |
+
threshold=40
|
104 |
+
)
|
105 |
+
with st.expander("Learn more about NO2", expanded=False):
|
106 |
+
st.markdown(
|
107 |
+
"""
|
108 |
+
*Nitrogen Dioxide (NO<sub>2</sub>)*: A toxic gas that contributes to lung irritation and worsens asthma and other respiratory issues.<br>
|
109 |
+
**Good/Bad**: "Good" means safe air quality, while "Bad" indicates levels that could cause respiratory problems, especially for vulnerable individuals.
|
110 |
+
""",
|
111 |
+
unsafe_allow_html=True,
|
112 |
+
)
|
113 |
+
|
114 |
+
# Create two columns for two separate graphs
|
115 |
+
st.subheader("O3 Forecast")
|
116 |
+
|
117 |
+
# Define the new color logic: green, orange, and red based on the threshold
|
118 |
+
def get_simple_color_scale(values, threshold):
|
119 |
+
"""Returns green for values below the threshold, orange for values between the threshold and 2x the threshold, and red for values above 2x the threshold."""
|
120 |
+
return [
|
121 |
+
"#77C124" if v < threshold else
|
122 |
+
"#E68B0A" if v < 2 * threshold else
|
123 |
+
"#E63946" for v in values
|
124 |
+
]
|
125 |
+
|
126 |
+
# O3 Bar Plot (threshold: 40)
|
127 |
+
o3_past_values = o3_values[:-3] # Last 3 values are predictions
|
128 |
+
o3_future_values = o3_values[-3:] # Last 3 values are predictions
|
129 |
+
o3_colors = get_simple_color_scale(o3_past_values, 40) # Color for past values
|
130 |
+
|
131 |
+
fig_o3 = go.Figure()
|
132 |
+
|
133 |
+
# Add past values
|
134 |
+
fig_o3.add_trace(
|
135 |
+
go.Bar(
|
136 |
+
x=df["Date"][:-3], # Dates for past values
|
137 |
+
y=o3_past_values,
|
138 |
+
name="O3 Past",
|
139 |
+
marker=dict(color=o3_colors), # Apply the color scale
|
140 |
+
hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
|
141 |
+
)
|
142 |
+
)
|
143 |
+
|
144 |
+
# Add predicted values with reduced opacity
|
145 |
+
predicted_o3_colors = get_simple_color_scale(o3_future_values, 40) # Color for future values
|
146 |
+
fig_o3.add_trace(
|
147 |
+
go.Bar(
|
148 |
+
x=df["Date"][-3:], # Dates for predicted values
|
149 |
+
y=o3_future_values,
|
150 |
+
name="O3 Predicted",
|
151 |
+
marker=dict(color=predicted_o3_colors, opacity=0.5), # Set opacity to 0.5 for predictions
|
152 |
+
hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
|
153 |
+
)
|
154 |
+
)
|
155 |
+
|
156 |
+
fig_o3.add_shape(
|
157 |
+
dict(
|
158 |
+
type="line",
|
159 |
+
x0=pd.Timestamp.today(),
|
160 |
+
x1=pd.Timestamp.today(),
|
161 |
+
y0=min(o3_values),
|
162 |
+
y1=max(o3_values),
|
163 |
+
line=dict(color="White", width=3, dash="dash"),
|
164 |
+
)
|
165 |
+
)
|
166 |
+
|
167 |
+
fig_o3.update_layout(
|
168 |
+
plot_bgcolor="rgba(0, 0, 0, 0)",
|
169 |
+
paper_bgcolor="rgba(0, 0, 0, 0)",
|
170 |
+
yaxis_title="O3 Concentration (µg/m³)",
|
171 |
+
font=dict(size=14),
|
172 |
+
hovermode="x",
|
173 |
+
xaxis=dict(
|
174 |
+
title="Date",
|
175 |
+
type="date",
|
176 |
+
tickmode="array",
|
177 |
+
tickvals=df["Date"],
|
178 |
+
tickformat="%d-%b",
|
179 |
+
tickangle=-45,
|
180 |
+
tickcolor="gray",
|
181 |
+
),
|
182 |
+
showlegend=False # Disable legend
|
183 |
+
)
|
184 |
+
|
185 |
+
st.plotly_chart(fig_o3, key="fig_o3")
|
186 |
+
|
187 |
+
# NO2 Bar Plot (threshold: 120)
|
188 |
+
st.subheader("NO2 Forecast")
|
189 |
+
no2_past_values = no2_values[:-3] # Last 3 values are predictions
|
190 |
+
no2_future_values = no2_values[-3:] # Last 3 values are predictions
|
191 |
+
no2_colors = get_simple_color_scale(no2_past_values, 120) # Color for past values
|
192 |
+
|
193 |
+
fig_no2 = go.Figure()
|
194 |
+
|
195 |
+
# Add past values
|
196 |
+
fig_no2.add_trace(
|
197 |
+
go.Bar(
|
198 |
+
x=df["Date"][:-3], # Dates for past values
|
199 |
+
y=no2_past_values,
|
200 |
+
name="NO2 Past",
|
201 |
+
marker=dict(color=no2_colors), # Apply the color scale
|
202 |
+
hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
|
203 |
+
)
|
204 |
+
)
|
205 |
+
|
206 |
+
# Add predicted values with reduced opacity
|
207 |
+
predicted_no2_colors = get_simple_color_scale(no2_future_values, 120) # Color for future values
|
208 |
+
fig_no2.add_trace(
|
209 |
+
go.Bar(
|
210 |
+
x=df["Date"][-3:], # Dates for predicted values
|
211 |
+
y=no2_future_values,
|
212 |
+
name="NO2 Predicted",
|
213 |
+
marker=dict(color=predicted_no2_colors, opacity=0.5), # Set opacity to 0.5 for predictions
|
214 |
+
hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
|
215 |
+
)
|
216 |
+
)
|
217 |
+
|
218 |
+
fig_no2.add_shape(
|
219 |
+
dict(
|
220 |
+
type="line",
|
221 |
+
x0=pd.Timestamp.today(),
|
222 |
+
x1=pd.Timestamp.today(),
|
223 |
+
y0=min(no2_values),
|
224 |
+
y1=max(no2_values),
|
225 |
+
line=dict(color="White", width=3, dash="dash"),
|
226 |
+
)
|
227 |
+
)
|
228 |
+
|
229 |
+
fig_no2.update_layout(
|
230 |
+
plot_bgcolor="rgba(0, 0, 0, 0)",
|
231 |
+
paper_bgcolor="rgba(0, 0, 0, 0)",
|
232 |
+
yaxis_title="NO<sub>2</sub> Concentration (µg/m³)",
|
233 |
+
font=dict(size=14),
|
234 |
+
hovermode="x",
|
235 |
+
xaxis=dict(
|
236 |
+
title="Date",
|
237 |
+
type="date",
|
238 |
+
tickmode="array",
|
239 |
+
tickvals=df["Date"],
|
240 |
+
tickformat="%d-%b",
|
241 |
+
tickangle=-45,
|
242 |
+
tickcolor="gray",
|
243 |
+
),
|
244 |
+
showlegend=False # Disable legend
|
245 |
+
)
|
246 |
+
|
247 |
+
st.plotly_chart(fig_no2, key="fig_no2")
|
pages/admin.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
import streamlit as st
|
5 |
+
from sklearn.metrics import mean_squared_error
|
6 |
+
from src.data_api_calls import get_combined_data
|
7 |
+
|
8 |
+
USERNAME = "admin"
|
9 |
+
PASSWORD = "password"
|
10 |
+
|
11 |
+
st.title("Admin Panel")
|
12 |
+
|
13 |
+
# Use session state to remember login state
|
14 |
+
if "login_success" not in st.session_state:
|
15 |
+
st.session_state.login_success = False
|
16 |
+
|
17 |
+
# Login Form
|
18 |
+
if not st.session_state.login_success:
|
19 |
+
with st.form("login_form"):
|
20 |
+
st.write("Please login to access the admin dashboard:")
|
21 |
+
username = st.text_input("Username")
|
22 |
+
password = st.text_input("Password", type="password")
|
23 |
+
login_button = st.form_submit_button("Login")
|
24 |
+
|
25 |
+
if login_button:
|
26 |
+
if username == USERNAME and password == PASSWORD:
|
27 |
+
st.session_state.login_success = True
|
28 |
+
st.success("Login successful!")
|
29 |
+
else:
|
30 |
+
st.error("Invalid username or password.")
|
31 |
+
else:
|
32 |
+
# Fetching the combined data
|
33 |
+
table_data = get_combined_data()
|
34 |
+
|
35 |
+
# Check for missing values
|
36 |
+
missing_values = table_data.isnull()
|
37 |
+
|
38 |
+
# Display the main data table
|
39 |
+
st.subheader("Data used for the prediction")
|
40 |
+
|
41 |
+
# Display message based on whether data is complete
|
42 |
+
if missing_values.values.any():
|
43 |
+
# Warning message if there are missing values
|
44 |
+
st.markdown(
|
45 |
+
"<h4 style='color: #E68B0A;'>Warning: Some data is missing!</h4>",
|
46 |
+
unsafe_allow_html=True,
|
47 |
+
)
|
48 |
+
|
49 |
+
# Identify columns with missing values
|
50 |
+
missing_columns = table_data.columns[missing_values.any()].tolist()
|
51 |
+
|
52 |
+
# Identify rows (dates) with missing values
|
53 |
+
missing_rows = table_data[missing_values.any(axis=1)]["Date"].tolist()
|
54 |
+
|
55 |
+
# Display additional information about missing columns and rows
|
56 |
+
if missing_columns:
|
57 |
+
st.markdown(f"**Columns with missing data:** {', '.join(missing_columns)}")
|
58 |
+
if missing_rows:
|
59 |
+
st.markdown(
|
60 |
+
f"**Rows with missing data (dates):** {', '.join(missing_rows)}"
|
61 |
+
)
|
62 |
+
else:
|
63 |
+
# Success message if no data is missing
|
64 |
+
st.markdown(
|
65 |
+
"<h4 style='color: #77C124;'>All data is complete!</h4>",
|
66 |
+
unsafe_allow_html=True,
|
67 |
+
)
|
68 |
+
st.dataframe(table_data)
|
69 |
+
# Actual data vs 1,2,3 days ahead predictions
|
70 |
+
actual_data = pd.read_csv("pollution_data.csv")
|
71 |
+
prediction_data = pd.read_csv("predictions_history.csv")
|
72 |
+
|
73 |
+
col1, col2 = st.columns(2)
|
74 |
+
with col1:
|
75 |
+
pollutant = st.radio("Select a pollutant", ("O3", "NO2"))
|
76 |
+
with col2:
|
77 |
+
days_ahead = st.radio("Select days ahead for prediction", (1, 2, 3))
|
78 |
+
|
79 |
+
predictions = prediction_data[prediction_data["pollutant"] == pollutant]
|
80 |
+
actual = actual_data[["date", pollutant]].rename(
|
81 |
+
columns={pollutant: "actual_value"}
|
82 |
+
)
|
83 |
+
|
84 |
+
predictions_filtered = predictions[
|
85 |
+
predictions["date_predicted"]
|
86 |
+
== (
|
87 |
+
pd.to_datetime(predictions["date"]) - pd.Timedelta(days=days_ahead)
|
88 |
+
).dt.strftime("%Y-%m-%d")
|
89 |
+
]
|
90 |
+
|
91 |
+
fig = go.Figure()
|
92 |
+
|
93 |
+
fig.add_trace(
|
94 |
+
go.Scatter(
|
95 |
+
x=actual["date"],
|
96 |
+
y=actual["actual_value"],
|
97 |
+
mode="lines+markers",
|
98 |
+
name="Ground Truth",
|
99 |
+
line=dict(color="green", width=3),
|
100 |
+
)
|
101 |
+
)
|
102 |
+
|
103 |
+
fig.add_trace(
|
104 |
+
go.Scatter(
|
105 |
+
x=predictions_filtered["date"],
|
106 |
+
y=predictions_filtered["prediction_value"],
|
107 |
+
mode="lines+markers",
|
108 |
+
name=f"Prediction {days_ahead} day(s) ahead",
|
109 |
+
line=dict(dash="dash", color="orange", width=3),
|
110 |
+
)
|
111 |
+
)
|
112 |
+
|
113 |
+
fig.update_layout(
|
114 |
+
title=f"{pollutant} Predictions vs Actual Values",
|
115 |
+
xaxis_title="Date",
|
116 |
+
yaxis_title=f"{pollutant} Concentration",
|
117 |
+
legend=dict(x=0, y=1),
|
118 |
+
yaxis=dict(range=[0, 60]),
|
119 |
+
template="plotly_white",
|
120 |
+
xaxis=dict(
|
121 |
+
title="Date",
|
122 |
+
type="date",
|
123 |
+
tickmode="array",
|
124 |
+
tickvals=predictions["date"],
|
125 |
+
tickformat="%d-%b",
|
126 |
+
tickangle=-45,
|
127 |
+
tickcolor="gray",
|
128 |
+
),
|
129 |
+
)
|
130 |
+
|
131 |
+
st.plotly_chart(fig)
|
132 |
+
|
133 |
+
# Evaluation Function
|
134 |
+
def evaluate_predictions_all_days(actual, predictions):
|
135 |
+
rmse_values_all = {"O3": [], "NO2": []}
|
136 |
+
smape_values_all = {"O3": [], "NO2": []}
|
137 |
+
|
138 |
+
for pollutant in ["O3", "NO2"]:
|
139 |
+
predictions_pollutant = predictions[predictions["pollutant"] == pollutant]
|
140 |
+
actual_pollutant = actual_data[["date", pollutant]].rename(
|
141 |
+
columns={pollutant: "actual_value"}
|
142 |
+
)
|
143 |
+
|
144 |
+
# Calculate RMSE and SMAPE for each day (1st, 2nd, and 3rd)
|
145 |
+
for i in range(1, 4):
|
146 |
+
predictions_filtered = predictions_pollutant[
|
147 |
+
predictions_pollutant["date_predicted"]
|
148 |
+
== (
|
149 |
+
pd.to_datetime(predictions_pollutant["date"])
|
150 |
+
- pd.Timedelta(days=i)
|
151 |
+
).dt.strftime("%Y-%m-%d")
|
152 |
+
]
|
153 |
+
actual_filtered = actual_pollutant[
|
154 |
+
actual_pollutant["date"].isin(predictions_filtered["date"])
|
155 |
+
]
|
156 |
+
merged = pd.merge(
|
157 |
+
actual_filtered,
|
158 |
+
predictions_filtered,
|
159 |
+
left_on="date",
|
160 |
+
right_on="date",
|
161 |
+
)
|
162 |
+
|
163 |
+
if not merged.empty:
|
164 |
+
actual_values = merged["actual_value"].values
|
165 |
+
prediction_values = merged["prediction_value"].values
|
166 |
+
|
167 |
+
rmse = np.sqrt(mean_squared_error(actual_values, prediction_values))
|
168 |
+
rmse_values_all[pollutant].append(rmse)
|
169 |
+
smape = (
|
170 |
+
100
|
171 |
+
/ len(actual_values)
|
172 |
+
* np.sum(
|
173 |
+
2
|
174 |
+
* np.abs(prediction_values - actual_values)
|
175 |
+
/ (np.abs(actual_values) + np.abs(prediction_values))
|
176 |
+
)
|
177 |
+
)
|
178 |
+
smape_values_all[pollutant].append(smape)
|
179 |
+
|
180 |
+
# Plot RMSE and SMAPE for both pollutants
|
181 |
+
fig_rmse = go.Figure()
|
182 |
+
for day in range(3):
|
183 |
+
fig_rmse.add_trace(
|
184 |
+
go.Bar(
|
185 |
+
x=["O3", "NO2"],
|
186 |
+
y=[rmse_values_all["O3"][day], rmse_values_all["NO2"][day]],
|
187 |
+
name=f"Day {day + 1}",
|
188 |
+
)
|
189 |
+
)
|
190 |
+
fig_rmse.update_layout(
|
191 |
+
title="RMSE for Predictions Over 3 Days",
|
192 |
+
yaxis_title="RMSE",
|
193 |
+
xaxis_title="Pollutant",
|
194 |
+
barmode="group",
|
195 |
+
)
|
196 |
+
st.plotly_chart(fig_rmse)
|
197 |
+
|
198 |
+
fig_smape = go.Figure()
|
199 |
+
for day in range(3):
|
200 |
+
fig_smape.add_trace(
|
201 |
+
go.Bar(
|
202 |
+
x=["O3", "NO2"],
|
203 |
+
y=[smape_values_all["O3"][day], smape_values_all["NO2"][day]],
|
204 |
+
name=f"Day {day + 1}",
|
205 |
+
)
|
206 |
+
)
|
207 |
+
fig_smape.update_layout(
|
208 |
+
title="SMAPE for Predictions Over 3 Days",
|
209 |
+
yaxis_title="SMAPE (%)",
|
210 |
+
xaxis_title="Pollutant",
|
211 |
+
barmode="group",
|
212 |
+
)
|
213 |
+
st.plotly_chart(fig_smape)
|
214 |
+
|
215 |
+
# Calculate total current SMAPE and RMSE
|
216 |
+
total_O3_smape = sum(smape_values_all["O3"]) / len(smape_values_all)
|
217 |
+
total_NO2_smape = sum(smape_values_all["NO2"]) / len(smape_values_all)
|
218 |
+
total_O3_rmse = sum(rmse_values_all["O3"]) / len(rmse_values_all)
|
219 |
+
total_NO2_rmse = sum(rmse_values_all["NO2"]) / len(rmse_values_all)
|
220 |
+
|
221 |
+
# Display metrics table
|
222 |
+
metrics_data = {
|
223 |
+
"Metric": [
|
224 |
+
"Current NO2 SMAPE (%)",
|
225 |
+
"Current NO2 RMSE (µg/m3)",
|
226 |
+
"Current O3 SMAPE (%)",
|
227 |
+
"Current O3 RMSE (µg/m3)",
|
228 |
+
],
|
229 |
+
"Value": [total_NO2_smape, total_NO2_rmse, total_O3_smape, total_O3_rmse],
|
230 |
+
}
|
231 |
+
metrics_df = pd.DataFrame(metrics_data)
|
232 |
+
st.table(metrics_df)
|
233 |
+
|
234 |
+
evaluate_predictions_all_days(actual_data, prediction_data)
|
past_pollution_data.csv
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
date,NO2,O3
|
2 |
+
2023-10-18,10.8427027027027,39.81260000000001
|
3 |
+
2023-10-19,17.97026666666666,31.779024390243908
|
4 |
+
2023-10-20,17.233055555555563,18.7156
|
5 |
+
2023-10-21,15.023599999999991,22.04
|
6 |
+
2023-10-22,8.723378378378372,48.33439999999999
|
7 |
+
2023-10-23,20.63426666666668,15.586000000000002
|
8 |
+
2023-10-24,15.1156,24.62808510638297
|
9 |
+
2023-10-25,22.88567567567568,27.117599999999992
|
10 |
+
2023-10-26,21.53175675675676,13.3216
|
11 |
+
2023-10-27,23.07226666666666,16.15416666666666
|
12 |
+
2023-10-28,24.89121621621622,24.59040816326531
|
13 |
+
2023-10-29,9.724428571428573,51.525200000000005
|
14 |
+
2023-10-30,11.20205479452055,52.820600000000006
|
15 |
+
2023-10-31,17.494666666666667,44.458541666666655
|
16 |
+
2023-11-01,21.588095238095235,29.20631578947369
|
17 |
+
2023-11-02,9.745714285714286,48.39760869565216
|
18 |
+
2023-11-03,7.163243243243242,61.421599999999984
|
past_weather_data.csv
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
|
2 |
+
2023-10-17,8.5,84.8,0.0,22.3,1019.3,34.8,75.2
|
3 |
+
2023-10-18,9.0,77.9,2.3,25.9,1006.0,23.8,71.2
|
4 |
+
2023-10-19,14.5,94.0,11.4,22.3,990.8,21.2,39.8
|
5 |
+
2023-10-20,11.9,97.4,20.4,25.9,981.0,10.4,7.0
|
6 |
+
2023-10-21,13.1,88.0,3.5,22.3,989.4,27.7,39.9
|
7 |
+
2023-10-22,12.1,87.3,3.9,25.9,1003.6,32.3,55.9
|
8 |
+
2023-10-23,9.9,95.7,0.5,18.0,1011.1,5.9,43.8
|
9 |
+
2023-10-24,11.6,92.3,6.5,22.3,1001.3,23.1,32.6
|
10 |
+
2023-10-25,9.3,96.8,15.3,18.0,996.8,15.7,14.5
|
11 |
+
2023-10-26,9.4,97.6,0.1,11.2,995.6,4.8,36.0
|
12 |
+
2023-10-27,10.6,97.9,11.4,14.8,992.0,9.5,20.5
|
13 |
+
2023-10-28,11.4,88.6,3.0,18.4,994.4,29.3,48.5
|
14 |
+
2023-10-29,13.0,82.2,9.5,31.7,991.5,38.8,35.4
|
15 |
+
2023-10-30,11.2,90.4,13.0,18.4,997.5,28.8,27.0
|
16 |
+
2023-10-31,11.0,93.7,18.6,18.0,1000.7,17.9,29.8
|
17 |
+
2023-11-01,12.4,88.5,4.9,25.9,997.8,32.6,31.5
|
18 |
+
2023-11-02,11.0,80.0,8.7,46.4,976.4,33.6,21.5
|
19 |
+
2023-11-03,9.6,83.3,7.9,32.4,981.6,31,40.1
|
pollution_data.csv
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
date,NO2,O3
|
2 |
+
2024-10-17,22.804605103280675,22.769159859976643
|
3 |
+
2024-10-18,23.26858769887009,23.30733245729302
|
4 |
+
2024-10-19,23.91006441223834,23.1717142857143
|
5 |
+
2024-10-20,22.57323754789273,23.53784452296821
|
6 |
+
2024-10-21,21.1457004830918,24.02069565217393
|
7 |
+
2024-10-22,21.77657980456027,23.33588571428572
|
8 |
+
2024-10-23,21.974793814433,22.21468879668051
|
9 |
+
2024-10-24,25.51256756756757,20.91370967741937
|
10 |
+
2024-10-25,21.72051282051282,22.33230769230769
|
11 |
+
2024-10-26,24.46423484380123,18.70331123489324
|
12 |
+
2024-10-27,27.53722134983982,20.80809239842384
|
13 |
+
2024-10-28,23.337567567567568,26.82861788617886
|
14 |
+
2024-10-29,16.53533209586906,23.28254887605004
|
15 |
+
2024-10-30,22.26162162162162,18.03443548387097
|
16 |
+
2024-10-31,24.919333333333334,20.79696
|
predictions_history.csv
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pollutant,date_predicted,date,prediction_value
|
2 |
+
O3,2024-10-14,2024-10-17,31.25335185244893
|
3 |
+
NO2,2024-10-14,2024-10-17,26.421736787446267
|
4 |
+
O3,2024-10-15,2024-10-17,22.00005767760448
|
5 |
+
NO2,2024-10-15,2024-10-17,28.59511317503212
|
6 |
+
O3,2024-10-16,2024-10-17,9.657466070999735
|
7 |
+
NO2,2024-10-16,2024-10-17,17.065168790519902
|
8 |
+
O3,2024-10-15,2024-10-18,6.561248
|
9 |
+
NO2,2024-10-15,2024-10-18,26.443672
|
10 |
+
O3,2024-10-16,2024-10-18,19.782418
|
11 |
+
NO2,2024-10-16,2024-10-18,36.453956
|
12 |
+
O3,2024-10-17,2024-10-18,16.08841798553393
|
13 |
+
NO2,2024-10-17,2024-10-18,32.0458143607889
|
14 |
+
O3,2024-10-16,2024-10-19,24.031357603260783
|
15 |
+
NO2,2024-10-16,2024-10-19,20.08389395558791
|
16 |
+
O3,2024-10-17,2024-10-19,21.031357603260783
|
17 |
+
NO2,2024-10-17,2024-10-19,27.08389395558791
|
18 |
+
O3,2024-10-17,2024-10-20,20.48486247979324
|
19 |
+
NO2,2024-10-17,2024-10-20,23.84300578029378
|
20 |
+
O3,2024-10-18,2024-10-19,22.304547122637445
|
21 |
+
NO2,2024-10-18,2024-10-19,20.80017116560889
|
22 |
+
O3,2024-10-18,2024-10-20,31.25335185244893
|
23 |
+
NO2,2024-10-18,2024-10-20,29.732316066240585
|
24 |
+
O3,2024-10-18,2024-10-21,28.67755196805434
|
25 |
+
NO2,2024-10-18,2024-10-21,35.04638743773354
|
26 |
+
O3,2024-10-19,2024-10-20,26.421736787446267
|
27 |
+
NO2,2024-10-19,2024-10-20,27.399885723190767
|
28 |
+
O3,2024-10-19,2024-10-21,17.065168790519902
|
29 |
+
NO2,2024-10-19,2024-10-21,18.992352714813563
|
30 |
+
O3,2024-10-19,2024-10-22,17.39682962048955
|
31 |
+
NO2,2024-10-19,2024-10-22,22.85061675885908
|
32 |
+
O3,2024-10-20,2024-10-21,22.00005767760448
|
33 |
+
NO2,2024-10-20,2024-10-21,18.27191592927812
|
34 |
+
O3,2024-10-20,2024-10-22,29.00940466937953
|
35 |
+
NO2,2024-10-20,2024-10-22,19.50739766963497
|
36 |
+
O3,2024-10-20,2024-10-23,20.062134354543343
|
37 |
+
NO2,2024-10-20,2024-10-23,23.65746607099973
|
38 |
+
O3,2024-10-21,2024-10-22,17.497382318189132
|
39 |
+
NO2,2024-10-21,2024-10-22,28.59511317503212
|
40 |
+
O3,2024-10-21,2024-10-23,16.519952190354232
|
41 |
+
NO2,2024-10-21,2024-10-23,30.192389708351826
|
42 |
+
O3,2024-10-21,2024-10-24,28.19940385112904
|
43 |
+
NO2,2024-10-21,2024-10-24,17.9525039623211
|
44 |
+
O3,2024-10-22,2024-10-23,16.093074246425157
|
45 |
+
NO2,2024-10-22,2024-10-23,25.217639978187005
|
46 |
+
O3,2024-10-22,2024-10-24,23.605545201596552
|
47 |
+
NO2,2024-10-22,2024-10-24,29.004701753536988
|
48 |
+
O3,2024-10-23,2024-10-24,26.56486295059828
|
49 |
+
NO2,2024-10-23,2024-10-24,20.15373733747257
|
50 |
+
O3,2024-10-24,2024-10-25,10.33808859423279
|
51 |
+
NO2,2024-10-24,2024-10-25,25.68519991558237
|
52 |
+
O3,2024-10-24,2024-10-26,16.000984317626852
|
53 |
+
NO2,2024-10-24,2024-10-26,25.760307451092384
|
54 |
+
O3,2024-10-24,2024-10-27,19.64377495640328
|
55 |
+
NO2,2024-10-24,2024-10-27,31.210576791105115
|
56 |
+
O3,2024-10-25,2024-10-26,20.48055947200643
|
57 |
+
NO2,2024-10-25,2024-10-26,23.95723903986424
|
58 |
+
O3,2024-10-25,2024-10-27,11.088152958498888
|
59 |
+
NO2,2024-10-25,2024-10-27,32.274494671100506
|
60 |
+
O3,2024-10-25,2024-10-28,-0.7175631399505704
|
61 |
+
NO2,2024-10-25,2024-10-28,40.86107800019054
|
62 |
+
O3,2024-10-28,2024-10-29,22.13652238154496
|
63 |
+
NO2,2024-10-28,2024-10-29,31.608886931951144
|
64 |
+
O3,2024-10-28,2024-10-30,15.841669224
|
65 |
+
NO2,2024-10-28,2024-10-30,34.564284711452984
|
66 |
+
O3,2024-10-28,2024-10-31,22.35944571003375
|
67 |
+
NO2,2024-10-28,2024-10-31,34.37482132111927
|
68 |
+
O3,2024-10-30,2024-10-31,15.98046542733637
|
69 |
+
NO2,2024-10-30,2024-10-31,29.77507241979599
|
70 |
+
O3,2024-10-30,2024-11-01,21.135906183680472
|
71 |
+
NO2,2024-10-30,2024-11-01,28.38872595850704
|
72 |
+
O3,2024-10-30,2024-11-02,19.67426015042635
|
73 |
+
O3,2024-10-31,2024-11-01,16.491393851863755
|
74 |
+
NO2,2024-10-31,2024-11-01,17.22825222459993
|
75 |
+
O3,2024-10-31,2024-11-02,16.874728806873033
|
76 |
+
NO2,2024-10-31,2024-11-02,14.771381333796965
|
77 |
+
O3,2024-10-31,2024-11-03,15.244292496093546
|
78 |
+
NO2,2024-10-31,2024-11-03,14.606430068166452
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
joblib # or pickle if you're using that to load the model
|
5 |
+
scikit-learn # for mock model
|
6 |
+
altair
|
7 |
+
matplotlib
|
8 |
+
plotly
|
9 |
+
http.client
|
10 |
+
datetime
|
11 |
+
huggingface-hub
|
12 |
+
python-dotenv
|
13 |
+
torch
|
14 |
+
safetensors
|
src/data_api_calls.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import codecs
|
2 |
+
import csv
|
3 |
+
import http.client
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import sys
|
7 |
+
import urllib.request
|
8 |
+
from datetime import date, timedelta
|
9 |
+
from io import StringIO
|
10 |
+
|
11 |
+
import pandas as pd
|
12 |
+
|
13 |
+
WEATHER_DATA_FILE = "weather_data.csv"
|
14 |
+
POLLUTION_DATA_FILE = "pollution_data.csv"
|
15 |
+
|
16 |
+
|
17 |
+
def update_weather_data() -> None:
    """Fetch daily Utrecht weather from Visual Crossing and append it to WEATHER_DATA_FILE.

    If the CSV already exists, only days after the last stored date are
    requested; otherwise the previous 7 days are fetched. Duplicate dates keep
    the most recent row. On an HTTP/URL error the error is printed and the
    process exits (script-style behaviour preserved from the original).
    """
    today = date.today().isoformat()

    if os.path.exists(WEATHER_DATA_FILE):
        df = pd.read_csv(WEATHER_DATA_FILE)
        last_date = pd.to_datetime(df["date"]).max()
        start_date = (last_date + timedelta(1)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (date.today() - timedelta(7)).isoformat()

    # NOTE(review): the API key was hard-coded in the URL; allow overriding via
    # an env var while keeping the old key as fallback for backward compatibility.
    api_key = os.getenv("VISUAL_CROSSING_API_KEY", "7Y6AY56M6RWVNHQ3SAVHNJWFS")
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"Utrecht/{start_date}/{today}?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity"
        f"&include=days&key={api_key}&maxStations=1&contentType=csv"
    )
    try:
        # Context manager closes the HTTP response deterministically (the
        # original left it open).
        with urllib.request.urlopen(url) as result_bytes:
            rows = list(csv.reader(codecs.iterdecode(result_bytes, "utf-8")))

        new_data = pd.DataFrame(rows)
        new_data.columns = new_data.iloc[0]  # first CSV row is the header
        new_data = new_data[1:]
        new_data = new_data.rename(columns={"datetime": "date"})

        updated_df = pd.concat([df, new_data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        # HTTPError carries a readable response body with error details.
        print("Error code: ", e.code, e.read().decode())
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read()/.code — the old handler raised
        # AttributeError and masked the real network failure. Report .reason.
        print("Error: ", e.reason)
        sys.exit()
51 |
+
|
52 |
+
|
53 |
+
def update_pollution_data() -> None:
    """Append today's average NO2/O3 measurements to POLLUTION_DATA_FILE.

    Queries the Luchtmeetnet open API for three Utrecht stations, averages the
    numeric values found per pollutant over the last 24 hours, and appends one
    row (date, NO2, O3). Skips the fetch entirely when the CSV already
    contains a row for today.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    # 24h measurement window ending today 09:00 UTC.
    today = date.today().isoformat() + "T09:00:00Z"
    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"

    if os.path.exists(POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.Timestamp(date.today()):
            print("Data is already up to date.")
            return
    else:
        # BUG FIX: existing_data was undefined on the first run (no CSV yet),
        # which made the concat below raise NameError.
        existing_data = pd.DataFrame()

    # Only pull data for today if not already updated
    for particle in particles:
        # BUG FIX (consistency with update_past_pollution_data): reset the
        # per-particle frame list; previously frames accumulated across
        # particles, so the O3 average also included the NO2 responses.
        all_dataframes = []
        for station in stations:
            conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
            payload = ""
            headers = {}
            conn.request(
                "GET",
                f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                payload,
                headers,
            )
            res = conn.getresponse()
            data = res.read()
            conn.close()  # release the connection once the body is read
            decoded_data = data.decode("utf-8")
            df = pd.read_csv(StringIO(decoded_data))
            df = df.filter(like="value")
            all_dataframes.append(df)
        combined_data = pd.concat(all_dataframes, ignore_index=True)
        values = []

        # NOTE(review): iterating a DataFrame yields its column labels; the
        # numeric measurements are regex-extracted from those labels because
        # the API response is not clean CSV. Confirm the response format
        # before refactoring this.
        for row in combined_data:
            cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
            if cleaned_value:
                values.append(float(cleaned_value[0]))

        if values:
            avg = sum(values) / len(values)
            if particle == "NO2":
                NO2.append(avg)
            else:
                O3.append(avg)

    new_data = pd.DataFrame(
        {
            "date": [date.today()],
            "NO2": NO2,
            "O3": O3,
        }
    )

    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
114 |
+
|
115 |
+
|
116 |
+
def get_combined_data():
    """Return the last 7 days of weather joined with NO2/O3 pollution data.

    Reads both CSVs, restricts to the trailing week, reorders/renames the
    weather columns into the fixed layout the feature pipeline expects, and
    applies the same x10 integer scaling used at training time.
    """
    weather_df = pd.read_csv(WEATHER_DATA_FILE)

    today = pd.Timestamp.now().normalize()
    seven_days_ago = today - pd.Timedelta(days=7)
    weather_df["date"] = pd.to_datetime(weather_df["date"])
    # .copy() avoids pandas SettingWithCopy warnings on the inserts below.
    weather_df = weather_df[
        (weather_df["date"] >= seven_days_ago) & (weather_df["date"] <= today)
    ].copy()

    # Insert placeholder pollution/weekday columns, then shuffle the column
    # order positionally into the layout expected downstream.
    weather_df.insert(1, "NO2", None)
    weather_df.insert(2, "O3", None)
    weather_df.insert(10, "weekday", None)
    columns = list(weather_df.columns)
    columns.insert(3, columns.pop(6))
    weather_df = weather_df[columns]
    columns.insert(5, columns.pop(9))
    weather_df = weather_df[columns]
    columns.insert(9, columns.pop(6))
    weather_df = weather_df[columns]

    combined_df = weather_df

    # Apply scaling and renaming similar to the scale function from previous code
    combined_df = combined_df.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    combined_df["date"] = pd.to_datetime(combined_df["date"])
    combined_df["weekday"] = combined_df["date"].dt.day_name()

    # km/h -> m/s (/3.6), then everything x10 to match the training units.
    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
    combined_df["percipitation"] = combined_df["percipitation"] * 10
    combined_df["pressure"] = combined_df["pressure"] * 10

    combined_df["wind_speed"] = combined_df["wind_speed"].astype(int)
    combined_df["mean_temp"] = combined_df["mean_temp"].astype(int)
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(int)
    combined_df["percipitation"] = combined_df["percipitation"].astype(int)
    combined_df["pressure"] = combined_df["pressure"].astype(int)
    combined_df["humidity"] = combined_df["humidity"].astype(int)
    combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)

    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)

    pollution_df["date"] = pd.to_datetime(pollution_df["date"])
    pollution_df = pollution_df[
        (pollution_df["date"] >= seven_days_ago) & (pollution_df["date"] <= today)
    ]

    # BUG FIX: the original assigned pollution_df["NO2"]/["O3"] directly,
    # which aligns on the positional row index — the two files' indices need
    # not match after filtering, silently producing NaNs. Align on the date.
    combined_df["NO2"] = combined_df["date"].map(pollution_df.set_index("date")["NO2"])
    combined_df["O3"] = combined_df["date"].map(pollution_df.set_index("date")["O3"])

    return combined_df
src/features_pipeline.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import warnings
|
3 |
+
|
4 |
+
import joblib
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from huggingface_hub import hf_hub_download, login
|
9 |
+
|
10 |
+
from src.past_data_api_calls import get_past_combined_data
|
11 |
+
|
12 |
+
# Silence noisy library warnings (e.g. pandas/sklearn deprecations) in app logs.
warnings.filterwarnings("ignore")

# Read HUGGINGFACE_DOWNLOAD_TOKEN from a local .env file and authenticate so
# hf_hub_download below can fetch the scaler artifacts.
load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
16 |
+
|
17 |
+
|
18 |
+
def create_features(
    data,
    target_particle,  # "O3" or "NO2"; selects the lag-feature set and scaler
    lag_days=7,
    sma_days=7,
):
    """Turn a week of combined weather/pollution rows into a scaled feature matrix.

    Builds cyclical date encodings, lagged values, simple moving averages and
    last-year pollutant values, then applies the feature scaler downloaded
    from the Hugging Face Hub for ``target_particle``.

    Args:
        data: DataFrame with a ``date`` column plus the weather/pollution
            columns listed in ``lag_features`` below.
        target_particle: Either ``"O3"`` or ``"NO2"``.
        lag_days: Number of daily lags (and last-year offsets) to generate.
        sma_days: Window length of the simple-moving-average features.

    Returns:
        DataFrame of scaled features; rows made incomplete by lagging/rolling
        are dropped.

    Raises:
        ValueError: If ``target_particle`` is not ``"O3"`` or ``"NO2"``.
    """
    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    # The NO2 model was additionally trained with precipitation/pressure lags.
    if target_particle == "NO2":
        lag_features = lag_features + ["percipitation", "pressure"]

    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)

    # Extract 'weekday' and 'month' from 'date' if not present (a string
    # weekday column such as "Monday" is replaced by its numeric form).
    if "weekday" not in data.columns or data["weekday"].dtype == object:
        data["weekday"] = data["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in data.columns:
        data["month"] = data["date"].dt.month  # 1 to 12

    # Sine/cosine transforms encode the cyclical nature of weekday and month
    # (Sunday is adjacent to Monday, December to January).
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Create lagged features for the specified lag days
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Create SMA features
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )

    # Create particle data (NO2 and O3) from the same time last year
    past_data = get_past_combined_data()

    # Today last year.
    # NOTE(review): the iloc offsets below assume past_data is exactly the
    # 11-row window produced by get_past_combined_data (which tails 11 rows)
    # — confirm before changing either function.
    data["O3_last_year"] = past_data["O3"].iloc[-4]
    data["NO2_last_year"] = past_data["NO2"].iloc[-4]

    # 7 days before today last year
    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]

    # 3 days after today last year
    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]

    # Drop rows made incomplete by lag/rolling windows and report how many.
    rows_before = data.shape[0]
    data = data.dropna().reset_index(drop=True)
    rows_after = data.shape[0]
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}/{rows_before}")
    print(data)

    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

    # Define feature columns (raw date/weekday/month are excluded; the model
    # only sees their cyclical encodings).
    exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Split features and targets
    x = data[feature_cols]

    # Scale with the feature scaler fitted at training time, fetched from the Hub.
    repo_id = f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model"
    file_name = f"feature_scaler_{target_particle}.joblib"
    path = hf_hub_download(repo_id=repo_id, filename=file_name)
    feature_scaler = joblib.load(path)
    X_scaled = feature_scaler.transform(x)

    # Convert scaled data back to DataFrame for consistency
    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)

    return X_scaled
src/helper_functions.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
|
4 |
+
def custom_metric_box(label, value):
    """Render a compact, left-aligned metric: a small label above a bold value.

    Args:
        label: Caption shown above the value.
        value: Metric text rendered in bold below the label.
    """
    box_html = f"""
    <div style="
        padding: 5px;
        margin-bottom: 5px;
        width: 100%; /* Full width */
        display: flex;
        flex-direction: column; /* Align items vertically */
        align-items: flex-start; /* Align all content to the left */
    ">
        <div>
            <h4 style="font-size: 14px; font-weight: normal; margin: 0;">{label}</h4> <!-- Smaller label -->
        </div>
        <div>
            <p style="font-size: 18px; font-weight: bold; margin: 0;">{value}</p> <!-- Smaller metric -->
        </div>
    </div>
    """
    st.markdown(box_html, unsafe_allow_html=True)
23 |
+
|
24 |
+
def pollution_box(label, value, delta, threshold):
    """Render a glass-style card showing a pollutant's Good/Bad status.

    Args:
        label: Pollutant name used as the card title.
        value: Measurement string whose leading number (``value.split()[0]``)
            is compared against ``threshold``.
        delta: Unused; kept so existing call sites keep working.
        threshold: Concentration at or above which the status reads "Bad".
    """
    # "Good" (green) below the threshold, otherwise "Bad" (orange).
    if float(value.split()[0]) < threshold:
        status, status_color = "Good", "#77C124"
    else:
        status, status_color = "Bad", "#E68B0A"

    card_html = f"""
    <div style="
        background: rgba(255, 255, 255, 0.05);
        border-radius: 16px;
        box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
        backdrop-filter: blur(5px);
        -webkit-backdrop-filter: blur(5px);
        border: 1px solid rgba(255, 255, 255, 0.15);
        padding: 15px;
        margin-bottom: 10px;
    ">
        <h4 style="font-size: 24px; font-weight: bold; margin: 0;">{label}</h4> <!-- Bigger label -->
        <p style="font-size: 36px; font-weight: bold; color: {status_color}; margin: 0;">{status}</p> <!-- Good/Bad with color -->
        <p style="font-size: 18px; margin: 0;">{value}</p> <!-- Smaller value where delta used to be -->
    </div>
    """
    st.markdown(card_html, unsafe_allow_html=True)
src/past_data_api_calls.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import codecs
|
2 |
+
import csv
|
3 |
+
import http.client
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import sys
|
7 |
+
import urllib.request
|
8 |
+
from datetime import date, timedelta
|
9 |
+
from io import StringIO
|
10 |
+
|
11 |
+
import pandas as pd
|
12 |
+
|
13 |
+
PAST_WEATHER_DATA_FILE = "past_weather_data.csv"
|
14 |
+
PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
|
15 |
+
|
16 |
+
|
17 |
+
def update_past_weather_data() -> None:
    """Fetch last year's Utrecht weather window into PAST_WEATHER_DATA_FILE.

    Covers roughly 8 days before through 2 days after today-minus-365. When
    the CSV exists, fetching resumes from the last stored date. Duplicate
    dates keep the freshest row. On an HTTP/URL error the error is printed and
    the process exits (script-style behaviour preserved from the original).
    """
    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_WEATHER_DATA_FILE):
        df = pd.read_csv(PAST_WEATHER_DATA_FILE)
        start_date = pd.to_datetime(df["date"]).max().date().isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (last_year_date - timedelta(days=8)).isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()

    # NOTE(review): the API key was hard-coded; allow overriding via env var
    # while keeping the old key as fallback for backward compatibility.
    api_key = os.getenv("VISUAL_CROSSING_API_KEY", "7Y6AY56M6RWVNHQ3SAVHNJWFS")
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"Utrecht/{start_date}/{end_date}?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity"
        f"&include=days&key={api_key}&maxStations=1&contentType=csv"
    )
    try:
        # Context manager closes the HTTP response (the original left it open).
        with urllib.request.urlopen(url) as result_bytes:
            rows = list(csv.reader(codecs.iterdecode(result_bytes, "utf-8")))

        data = pd.DataFrame(rows)
        data.columns = data.iloc[0]  # first CSV row is the header
        data = data[1:]
        data = data.rename(columns={"datetime": "date"})

        updated_df = pd.concat([df, data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(PAST_WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        # HTTPError carries a readable response body with error details.
        print("Error code: ", e.code, e.read().decode())
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read()/.code — the old handler raised
        # AttributeError and hid the real network failure. Report .reason.
        print("Error: ", e.reason)
        sys.exit()
52 |
+
|
53 |
+
|
54 |
+
def update_past_pollution_data():
    """Backfill last year's NO2/O3 daily averages into PAST_POLLUTION_DATA_FILE.

    Iterates day by day over the not-yet-stored window around today-minus-365,
    averaging each pollutant over three Utrecht stations per day.

    Returns:
        Tuple ``(NO2, O3)`` of the newly fetched per-day averages, or ``None``
        when the file is already up to date.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]

    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.to_datetime(last_year_date):
            print("Data is already up to date.")
            return
        else:
            start_date = last_date.date()
            end_date = last_year_date + timedelta(days=3)
    else:
        existing_data = pd.DataFrame()
        start_date = last_year_date - timedelta(days=7)
        end_date = last_year_date + timedelta(days=3)

    date_list = [
        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
    ]
    for current_date in date_list:
        # 24h measurement window ending 09:00 UTC on current_date.
        today = current_date.isoformat() + "T09:00:00Z"
        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
        for particle in particles:
            all_dataframes = []  # Reset for each particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                payload = ""
                headers = {}
                conn.request(
                    "GET",
                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                    payload,
                    headers,
                )
                res = conn.getresponse()
                data = res.read()
                conn.close()  # release the connection once the body is read
                decoded_data = data.decode("utf-8")
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like="value")
                all_dataframes.append(df)

            combined_data = pd.concat(all_dataframes, ignore_index=True)
            values = []
            # NOTE(review): iterating a DataFrame yields column labels; the
            # measurements are regex-extracted from them because the response
            # is not clean CSV. Confirm the format before refactoring.
            for row in combined_data:
                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
                if cleaned_value:
                    values.append(float(cleaned_value[0]))

            if values:
                avg = sum(values) / len(values)
            else:
                # BUG FIX: previously nothing was appended for a day with no
                # usable values, leaving NO2/O3 shorter than date_list and
                # making the DataFrame constructor below raise ValueError.
                avg = float("nan")
            if particle == "NO2":
                NO2.append(avg)
            else:
                O3.append(avg)

    new_data = pd.DataFrame(
        {
            "date": date_list,
            "NO2": NO2,
            "O3": O3,
        }
    )

    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(PAST_POLLUTION_DATA_FILE, index=False)

    return NO2, O3
130 |
+
|
131 |
+
|
132 |
+
def get_past_combined_data():
    """Return last year's 11-day window of weather + pollution, model-scaled.

    Refreshes both past-data CSVs, inner-joins them on date, keeps the last 11
    rows, then applies the same renaming and x10 integer scaling used for the
    current data in get_combined_data.
    """
    update_past_weather_data()
    update_past_pollution_data()

    combined_df = pd.read_csv(PAST_WEATHER_DATA_FILE)
    pollution_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)

    combined_df = combined_df.merge(pollution_data, on="date", how="inner")
    # Keep exactly 11 rows: create_features indexes into this window
    # positionally (iloc), so the length matters.
    combined_df = combined_df.tail(11)

    # Apply scaling and renaming similar to the scale function from previous code
    combined_df = combined_df.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    combined_df["date"] = pd.to_datetime(combined_df["date"])
    combined_df["weekday"] = combined_df["date"].dt.day_name()

    # CSV values arrive as strings; coerce to float first (rounding pressure
    # and humidity) before scaling.
    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float)
    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float)
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float)
    combined_df["percipitation"] = combined_df["percipitation"].astype(float)
    combined_df["pressure"] = combined_df["pressure"].astype(float).round()
    combined_df["humidity"] = combined_df["humidity"].astype(float).round()
    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float)

    # km/h -> m/s (/3.6) for wind; everything x10 to match training units.
    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
    combined_df["percipitation"] = combined_df["percipitation"] * 10
    combined_df["pressure"] = combined_df["pressure"] * 10

    # Round and cast to int so the past window matches the integer dtypes of
    # the current data produced by get_combined_data.
    combined_df["wind_speed"] = (
        combined_df["wind_speed"].astype(float).round().astype(int)
    )
    combined_df["mean_temp"] = (
        combined_df["mean_temp"].astype(float).round().astype(int)
    )
    combined_df["minimum_visibility"] = (
        combined_df["minimum_visibility"].astype(float).round().astype(int)
    )
    combined_df["percipitation"] = (
        combined_df["percipitation"].astype(float).round().astype(int)
    )
    combined_df["pressure"] = combined_df["pressure"].astype(float).round().astype(int)
    combined_df["humidity"] = combined_df["humidity"].astype(float).round().astype(int)
    combined_df["global_radiation"] = (
        combined_df["global_radiation"].astype(float).round().astype(int)
    )

    return combined_df
src/predict.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import date, datetime, timedelta
|
3 |
+
|
4 |
+
import joblib
|
5 |
+
import pandas as pd
|
6 |
+
import torch
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from huggingface_hub import hf_hub_download, login
|
9 |
+
|
10 |
+
from src.data_api_calls import (
|
11 |
+
get_combined_data,
|
12 |
+
update_pollution_data,
|
13 |
+
update_weather_data,
|
14 |
+
)
|
15 |
+
from src.features_pipeline import create_features
|
16 |
+
|
17 |
+
# Authenticate with the Hugging Face Hub using the token from a local .env
# file so the model/scaler downloads below can succeed.
load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
19 |
+
|
20 |
+
|
21 |
+
def load_nn():
    """Download and return the pretrained NO2 feed-forward network from the Hub.

    torch.nn and the hub mixin are imported inside the function so importing
    this module does not require them until an NO2 prediction is requested.
    """
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
        # Simple MLP: Linear -> ReLU -> Dropout per configured layer width.
        def __init__(self, input_size, layers, dropout_rate):
            super(AirPollutionNet, self).__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x):
            # Run the hidden stack, then the 3-way output head.
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model
49 |
+
|
50 |
+
|
51 |
+
def load_model(particle):
    """Load the forecasting model for *particle* ("O3" or "NO2").

    O3 uses an SVR pickle pulled from the Hugging Face Hub; NO2 uses the
    neural network returned by ``load_nn``.
    """
    if particle == "O3":
        pickle_path = hf_hub_download(
            repo_id=f"elisaklunder/Utrecht-{particle}-Forecasting-Model",
            filename="O3_svr_model.pkl",
        )
        return joblib.load(pickle_path)
    return load_nn()
61 |
+
|
62 |
+
|
63 |
+
def run_model(particle, data):
    """Produce a 3-value forecast for *particle* from a week of combined data.

    Args:
        particle: "O3" or "NO2" — selects the model and its target scaler.
        data: Combined weather/pollution DataFrame fed to create_features.

    Returns:
        The prediction inverse-transformed with the pollutant's target scaler.
    """
    input_data = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        # Torch model: run inference without building a computation graph.
        with torch.no_grad():
            prediction = model(torch.tensor(input_data.values, dtype=torch.float32))
        repo_id = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        file_name = "target_scaler_NO2.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)
    else:
        # scikit-learn SVR model.
        prediction = model.predict(input_data)

        repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        file_name = f"target_scaler_{particle}.joblib"
        path = hf_hub_download(repo_id=repo_id, filename=file_name)

    # Models predict in scaled space; map back to the original units.
    target_scaler = joblib.load(path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction
84 |
+
|
85 |
+
|
86 |
+
def update_data_and_predictions():
    """Refresh the weather/pollution CSVs, run both models, persist forecasts.

    Appends one row per pollutant per forecast horizon (t+1..t+3) to
    predictions_history.csv, replacing any rows already predicted today.
    """
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    prediction_data = []
    for i in range(3):  # horizons: tomorrow, +2 days, +3 days
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Filter out predictions made today to avoid duplicates
        existing_data = existing_data[
            ~(existing_data["date_predicted"] == str(date.today()))
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        # BUG FIX: drop_duplicates() returns a new DataFrame; the original
        # discarded the result, so duplicate rows were written to disk.
        combined_data = combined_data.drop_duplicates()
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)
130 |
+
|
131 |
+
|
132 |
+
def get_data_and_predictions():
    """Return this week's combined data plus today's stored O3/NO2 forecasts.

    Reads predictions_history.csv and selects the rows whose
    ``date_predicted`` equals today, split per pollutant.

    Returns:
        Tuple ``(week_data, [o3_predictions], [no2_predictions])`` where the
        prediction entries are numpy arrays of today's stored values.
    """
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    data = pd.read_csv(PREDICTIONS_FILE)

    # date_predicted is stored as an ISO date string; compare as strings.
    today = datetime.today().strftime("%Y-%m-%d")
    today_predictions = data[(data["date_predicted"] == today)]

    # Extract predictions for O3 and NO2
    o3_predictions = today_predictions[today_predictions["pollutant"] == "O3"][
        "prediction_value"
    ].values
    no2_predictions = today_predictions[today_predictions["pollutant"] == "NO2"][
        "prediction_value"
    ].values

    return week_data, [o3_predictions], [no2_predictions]
150 |
+
|
151 |
+
if __name__ == "__main__":
    # Refresh the data and regenerate the 3-day forecasts when run as a script.
    update_data_and_predictions()
weather_data.csv
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
|
2 |
+
2024-10-17,16.9,86.0,0.6,18.4,1010.0,37.1,43.0
|
3 |
+
2024-10-18,15.5,97.3,3.9,7.6,1014.0,4.5,42.9
|
4 |
+
2024-10-19,14.7,89.9,1.6,14.8,1014.1,22.8,43.5
|
5 |
+
2024-10-20,15.5,83.8,0.5,29.5,1016.0,41.5,0.0
|
6 |
+
2024-10-21,14.4,92.7,4.3,21.2,1020.6,22.0,27.8
|
7 |
+
2024-10-22,11.4,92.8,4.9,19.4,1026.9,22.6,57.0
|
8 |
+
2024-10-23,11.2,97.3,0.0,13.0,1032.8,6.5,12.5
|
9 |
+
2024-10-24,10.4,94.0,0.0,20.5,1024.7,13.0,62.5
|
10 |
+
2024-10-25,13.6,92.2,0.5,11.9,1016.8,24.0,93.0
|
11 |
+
2024-10-26,13.7,91.5,0.0,11.9,1016.3,23.3,8.0
|
12 |
+
2024-10-27,13.2,87.1,0.1,20.5,1019.4,10.4,28.6
|
13 |
+
2024-10-28,12.4,91.8,1.1,31.7,1021.8,12.8,27.3
|
14 |
+
2024-10-29,13.8,95.9,0.2,20.5,1023.1,8.1,16.0
|
15 |
+
2024-10-30,12.7,92.9,0.6,9.4,1027.5,12.5,32.8
|
16 |
+
2024-10-31,12.5,89.9,0.0,11.2,1027.1,17.1,70.6
|