akseljoonas commited on
Commit
359c749
·
0 Parent(s):

Update .gitlab-ci.yml file

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .venv/
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ *.joblib
6
+ scalers/
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Utrecht Pollution Prediction
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.39.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: 'Demo: Model to predict O3 and NO2 concentrations in Utrecht'
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/data_api_calls.cpython-312.pyc ADDED
Binary file (10.3 kB). View file
 
__pycache__/data_loading.cpython-312.pyc ADDED
Binary file (7.96 kB). View file
 
__pycache__/helper_functions.cpython-312.pyc ADDED
Binary file (2 kB). View file
 
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import altair as alt
2
+ import pandas as pd
3
+ import plotly.graph_objects as go
4
+ import streamlit as st
5
+
6
+ from src.helper_functions import custom_metric_box, pollution_box
7
+ from src.predict import get_data_and_predictions, update_data_and_predictions
8
+
9
+ st.set_page_config(
10
+ page_title="Utrecht Pollution Dashboard ",
11
+ page_icon="🌱",
12
+ layout="wide",
13
+ initial_sidebar_state="expanded",
14
+ )
15
+
16
+ alt.themes.enable("dark")
17
+
18
+ update_data_and_predictions()
19
+
20
+ week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()
21
+
22
+ today = week_data.iloc[-1]
23
+ previous_day = week_data.iloc[-2]
24
+
25
+ dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
26
+ dates_future = pd.date_range(
27
+ start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3
28
+ ).to_list()
29
+
30
+ # O3 and NO2 values for the past 7 days
31
+ o3_past_values = week_data["O3"]
32
+ no2_past_values = week_data["NO2"]
33
+ o3_future_values = pd.Series(predictions_O3[0].flatten())
34
+ no2_future_values = pd.Series(predictions_NO2[0].flatten())
35
+ o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
36
+ no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
37
+
38
+ dates = dates_past + dates_future
39
+ df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
40
+
41
+ # App Title
42
+ st.title("Utrecht Pollution Dashboard 🌱")
43
+
44
+ col1, col2 = st.columns((1, 3))
45
+ # Create a 3-column layout
46
+ with col1:
47
+ st.subheader("Current Weather")
48
+
49
+
50
+ custom_metric_box(
51
+ label="🥵 Temperature",
52
+ value=f"{round(today['mean_temp'] * 0.1)} °C",
53
+ )
54
+ custom_metric_box(
55
+ label="💧 Humidity",
56
+ value=f"{round(today['humidity'])} %",
57
+ )
58
+ custom_metric_box(
59
+ label="🪨 Pressure",
60
+ value=f"{round(today['pressure'] * 0.1)} hPa",
61
+ )
62
+
63
+ custom_metric_box(
64
+ label="🌧️ Precipitation",
65
+ value=f"{round(today['percipitation'] * 0.1)} mm",
66
+ )
67
+ custom_metric_box(
68
+ label="🌤️ Solar Radiation",
69
+ value=f"{round(today['global_radiation'])} J/m²",
70
+ )
71
+ custom_metric_box(
72
+ label="🌪️ Wind Speed",
73
+ value=f"{round(today['wind_speed'] * 0.1, 1)} m/s",
74
+ )
75
+
76
+ with col2:
77
+ st.subheader("Current Pollution Levels")
78
+ sub1, sub2 = st.columns((1, 1))
79
+
80
+ # Ozone (O₃) Pollution Box
81
+ with sub1:
82
+ pollution_box(
83
+ label="O<sub>3</sub>",
84
+ value=f"{round(today['O3'])} µg/m³",
85
+ delta=f"{round(int(today['O3']) - int(previous_day['O3']))} µg/m³",
86
+ threshold=120
87
+ )
88
+ with st.expander("Learn more about O3", expanded=False):
89
+ st.markdown(
90
+ """
91
+ *Ozone (O<sub>3</sub>)*: A harmful gas at ground level that can irritate the respiratory system and aggravate asthma.<br>
92
+ **Good/Bad**: "Good" means safe levels for most people, while "Bad" suggests harmful levels, especially for sensitive groups.
93
+ """,
94
+ unsafe_allow_html=True,
95
+ )
96
+
97
+ # Nitrogen Dioxide (NO₂) Pollution Box
98
+ with sub2:
99
+ pollution_box(
100
+ label="NO<sub>2</sub>",
101
+ value=f"{round(today['NO2'])} µg/m³",
102
+ delta=f"{round(int(today['NO2']) - int(previous_day['NO2']))} µg/m³",
103
+ threshold=40
104
+ )
105
+ with st.expander("Learn more about NO2", expanded=False):
106
+ st.markdown(
107
+ """
108
+ *Nitrogen Dioxide (NO<sub>2</sub>)*: A toxic gas that contributes to lung irritation and worsens asthma and other respiratory issues.<br>
109
+ **Good/Bad**: "Good" means safe air quality, while "Bad" indicates levels that could cause respiratory problems, especially for vulnerable individuals.
110
+ """,
111
+ unsafe_allow_html=True,
112
+ )
113
+
114
+ # Create two columns for two separate graphs
115
+ st.subheader("O3 Forecast")
116
+
117
# Colour helper: traffic-light banding of pollutant values around a threshold
def get_simple_color_scale(values, threshold):
    """Map each value to a traffic-light colour string.

    Green (#77C124) below *threshold*, orange (#E68B0A) from the threshold
    up to (but excluding) twice the threshold, red (#E63946) at or above
    twice the threshold.
    """
    def colour_for(v):
        if v < threshold:
            return "#77C124"
        if v < 2 * threshold:
            return "#E68B0A"
        return "#E63946"

    return [colour_for(v) for v in values]
125
+
126
# O3 Bar Plot — colour bands keyed to the EU O3 threshold of 120 µg/m³.
# BUG FIX: the previous value (40) is the NO2 threshold; it was swapped
# with the NO2 plot's, inconsistent with pollution_box(threshold=120) above.
o3_past_values = o3_values[:-3]  # all but the last 3 values are observations
o3_future_values = o3_values[-3:]  # last 3 values are model predictions
o3_colors = get_simple_color_scale(o3_past_values, 120)  # colours for past values

fig_o3 = go.Figure()

# Observed (past) O3 concentrations
fig_o3.add_trace(
    go.Bar(
        x=df["Date"][:-3],  # dates for past values
        y=o3_past_values,
        name="O3 Past",
        marker=dict(color=o3_colors),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)

# Predicted O3 values, drawn semi-transparent to distinguish them
predicted_o3_colors = get_simple_color_scale(o3_future_values, 120)
fig_o3.add_trace(
    go.Bar(
        x=df["Date"][-3:],  # dates for predicted values
        y=o3_future_values,
        name="O3 Predicted",
        marker=dict(color=predicted_o3_colors, opacity=0.5),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)
155
+
156
+ fig_o3.add_shape(
157
+ dict(
158
+ type="line",
159
+ x0=pd.Timestamp.today(),
160
+ x1=pd.Timestamp.today(),
161
+ y0=min(o3_values),
162
+ y1=max(o3_values),
163
+ line=dict(color="White", width=3, dash="dash"),
164
+ )
165
+ )
166
+
167
+ fig_o3.update_layout(
168
+ plot_bgcolor="rgba(0, 0, 0, 0)",
169
+ paper_bgcolor="rgba(0, 0, 0, 0)",
170
+ yaxis_title="O3 Concentration (µg/m³)",
171
+ font=dict(size=14),
172
+ hovermode="x",
173
+ xaxis=dict(
174
+ title="Date",
175
+ type="date",
176
+ tickmode="array",
177
+ tickvals=df["Date"],
178
+ tickformat="%d-%b",
179
+ tickangle=-45,
180
+ tickcolor="gray",
181
+ ),
182
+ showlegend=False # Disable legend
183
+ )
184
+
185
+ st.plotly_chart(fig_o3, key="fig_o3")
186
+
187
# NO2 Bar Plot — colour bands keyed to the EU NO2 threshold of 40 µg/m³.
# BUG FIX: the previous value (120) is the O3 threshold; it was swapped
# with the O3 plot's, inconsistent with pollution_box(threshold=40) above.
st.subheader("NO2 Forecast")
no2_past_values = no2_values[:-3]  # all but the last 3 values are observations
no2_future_values = no2_values[-3:]  # last 3 values are model predictions
no2_colors = get_simple_color_scale(no2_past_values, 40)  # colours for past values

fig_no2 = go.Figure()

# Observed (past) NO2 concentrations
fig_no2.add_trace(
    go.Bar(
        x=df["Date"][:-3],  # dates for past values
        y=no2_past_values,
        name="NO2 Past",
        marker=dict(color=no2_colors),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)

# Predicted NO2 values, drawn semi-transparent to distinguish them
predicted_no2_colors = get_simple_color_scale(no2_future_values, 40)
fig_no2.add_trace(
    go.Bar(
        x=df["Date"][-3:],  # dates for predicted values
        y=no2_future_values,
        name="NO2 Predicted",
        marker=dict(color=predicted_no2_colors, opacity=0.5),
        hovertemplate="%{x|%d-%b-%Y}<br>%{y} µg/m³<extra></extra>",
    )
)
217
+
218
+ fig_no2.add_shape(
219
+ dict(
220
+ type="line",
221
+ x0=pd.Timestamp.today(),
222
+ x1=pd.Timestamp.today(),
223
+ y0=min(no2_values),
224
+ y1=max(no2_values),
225
+ line=dict(color="White", width=3, dash="dash"),
226
+ )
227
+ )
228
+
229
+ fig_no2.update_layout(
230
+ plot_bgcolor="rgba(0, 0, 0, 0)",
231
+ paper_bgcolor="rgba(0, 0, 0, 0)",
232
+ yaxis_title="NO<sub>2</sub> Concentration (µg/m³)",
233
+ font=dict(size=14),
234
+ hovermode="x",
235
+ xaxis=dict(
236
+ title="Date",
237
+ type="date",
238
+ tickmode="array",
239
+ tickvals=df["Date"],
240
+ tickformat="%d-%b",
241
+ tickangle=-45,
242
+ tickcolor="gray",
243
+ ),
244
+ showlegend=False # Disable legend
245
+ )
246
+
247
+ st.plotly_chart(fig_no2, key="fig_no2")
pages/admin.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from sklearn.metrics import mean_squared_error

from src.data_api_calls import get_combined_data
7
+
8
# SECURITY: credentials were hard-coded ("admin"/"password"). Read them
# from the environment instead; the old values remain as defaults so
# existing deployments keep working, but they MUST be overridden in
# production via ADMIN_USERNAME / ADMIN_PASSWORD.
USERNAME = os.getenv("ADMIN_USERNAME", "admin")
PASSWORD = os.getenv("ADMIN_PASSWORD", "password")
10
+
11
+ st.title("Admin Panel")
12
+
13
+ # Use session state to remember login state
14
+ if "login_success" not in st.session_state:
15
+ st.session_state.login_success = False
16
+
17
+ # Login Form
18
+ if not st.session_state.login_success:
19
+ with st.form("login_form"):
20
+ st.write("Please login to access the admin dashboard:")
21
+ username = st.text_input("Username")
22
+ password = st.text_input("Password", type="password")
23
+ login_button = st.form_submit_button("Login")
24
+
25
+ if login_button:
26
+ if username == USERNAME and password == PASSWORD:
27
+ st.session_state.login_success = True
28
+ st.success("Login successful!")
29
+ else:
30
+ st.error("Invalid username or password.")
31
+ else:
32
+ # Fetching the combined data
33
+ table_data = get_combined_data()
34
+
35
+ # Check for missing values
36
+ missing_values = table_data.isnull()
37
+
38
+ # Display the main data table
39
+ st.subheader("Data used for the prediction")
40
+
41
+ # Display message based on whether data is complete
42
+ if missing_values.values.any():
43
+ # Warning message if there are missing values
44
+ st.markdown(
45
+ "<h4 style='color: #E68B0A;'>Warning: Some data is missing!</h4>",
46
+ unsafe_allow_html=True,
47
+ )
48
+
49
+ # Identify columns with missing values
50
+ missing_columns = table_data.columns[missing_values.any()].tolist()
51
+
52
+ # Identify rows (dates) with missing values
53
+ missing_rows = table_data[missing_values.any(axis=1)]["Date"].tolist()
54
+
55
+ # Display additional information about missing columns and rows
56
+ if missing_columns:
57
+ st.markdown(f"**Columns with missing data:** {', '.join(missing_columns)}")
58
+ if missing_rows:
59
+ st.markdown(
60
+ f"**Rows with missing data (dates):** {', '.join(missing_rows)}"
61
+ )
62
+ else:
63
+ # Success message if no data is missing
64
+ st.markdown(
65
+ "<h4 style='color: #77C124;'>All data is complete!</h4>",
66
+ unsafe_allow_html=True,
67
+ )
68
+ st.dataframe(table_data)
69
+ # Actual data vs 1,2,3 days ahead predictions
70
+ actual_data = pd.read_csv("pollution_data.csv")
71
+ prediction_data = pd.read_csv("predictions_history.csv")
72
+
73
+ col1, col2 = st.columns(2)
74
+ with col1:
75
+ pollutant = st.radio("Select a pollutant", ("O3", "NO2"))
76
+ with col2:
77
+ days_ahead = st.radio("Select days ahead for prediction", (1, 2, 3))
78
+
79
+ predictions = prediction_data[prediction_data["pollutant"] == pollutant]
80
+ actual = actual_data[["date", pollutant]].rename(
81
+ columns={pollutant: "actual_value"}
82
+ )
83
+
84
+ predictions_filtered = predictions[
85
+ predictions["date_predicted"]
86
+ == (
87
+ pd.to_datetime(predictions["date"]) - pd.Timedelta(days=days_ahead)
88
+ ).dt.strftime("%Y-%m-%d")
89
+ ]
90
+
91
+ fig = go.Figure()
92
+
93
+ fig.add_trace(
94
+ go.Scatter(
95
+ x=actual["date"],
96
+ y=actual["actual_value"],
97
+ mode="lines+markers",
98
+ name="Ground Truth",
99
+ line=dict(color="green", width=3),
100
+ )
101
+ )
102
+
103
+ fig.add_trace(
104
+ go.Scatter(
105
+ x=predictions_filtered["date"],
106
+ y=predictions_filtered["prediction_value"],
107
+ mode="lines+markers",
108
+ name=f"Prediction {days_ahead} day(s) ahead",
109
+ line=dict(dash="dash", color="orange", width=3),
110
+ )
111
+ )
112
+
113
+ fig.update_layout(
114
+ title=f"{pollutant} Predictions vs Actual Values",
115
+ xaxis_title="Date",
116
+ yaxis_title=f"{pollutant} Concentration",
117
+ legend=dict(x=0, y=1),
118
+ yaxis=dict(range=[0, 60]),
119
+ template="plotly_white",
120
+ xaxis=dict(
121
+ title="Date",
122
+ type="date",
123
+ tickmode="array",
124
+ tickvals=predictions["date"],
125
+ tickformat="%d-%b",
126
+ tickangle=-45,
127
+ tickcolor="gray",
128
+ ),
129
+ )
130
+
131
+ st.plotly_chart(fig)
132
+
133
+ # Evaluation Function
134
+ def evaluate_predictions_all_days(actual, predictions):
135
+ rmse_values_all = {"O3": [], "NO2": []}
136
+ smape_values_all = {"O3": [], "NO2": []}
137
+
138
+ for pollutant in ["O3", "NO2"]:
139
+ predictions_pollutant = predictions[predictions["pollutant"] == pollutant]
140
+ actual_pollutant = actual_data[["date", pollutant]].rename(
141
+ columns={pollutant: "actual_value"}
142
+ )
143
+
144
+ # Calculate RMSE and SMAPE for each day (1st, 2nd, and 3rd)
145
+ for i in range(1, 4):
146
+ predictions_filtered = predictions_pollutant[
147
+ predictions_pollutant["date_predicted"]
148
+ == (
149
+ pd.to_datetime(predictions_pollutant["date"])
150
+ - pd.Timedelta(days=i)
151
+ ).dt.strftime("%Y-%m-%d")
152
+ ]
153
+ actual_filtered = actual_pollutant[
154
+ actual_pollutant["date"].isin(predictions_filtered["date"])
155
+ ]
156
+ merged = pd.merge(
157
+ actual_filtered,
158
+ predictions_filtered,
159
+ left_on="date",
160
+ right_on="date",
161
+ )
162
+
163
+ if not merged.empty:
164
+ actual_values = merged["actual_value"].values
165
+ prediction_values = merged["prediction_value"].values
166
+
167
+ rmse = np.sqrt(mean_squared_error(actual_values, prediction_values))
168
+ rmse_values_all[pollutant].append(rmse)
169
+ smape = (
170
+ 100
171
+ / len(actual_values)
172
+ * np.sum(
173
+ 2
174
+ * np.abs(prediction_values - actual_values)
175
+ / (np.abs(actual_values) + np.abs(prediction_values))
176
+ )
177
+ )
178
+ smape_values_all[pollutant].append(smape)
179
+
180
+ # Plot RMSE and SMAPE for both pollutants
181
+ fig_rmse = go.Figure()
182
+ for day in range(3):
183
+ fig_rmse.add_trace(
184
+ go.Bar(
185
+ x=["O3", "NO2"],
186
+ y=[rmse_values_all["O3"][day], rmse_values_all["NO2"][day]],
187
+ name=f"Day {day + 1}",
188
+ )
189
+ )
190
+ fig_rmse.update_layout(
191
+ title="RMSE for Predictions Over 3 Days",
192
+ yaxis_title="RMSE",
193
+ xaxis_title="Pollutant",
194
+ barmode="group",
195
+ )
196
+ st.plotly_chart(fig_rmse)
197
+
198
+ fig_smape = go.Figure()
199
+ for day in range(3):
200
+ fig_smape.add_trace(
201
+ go.Bar(
202
+ x=["O3", "NO2"],
203
+ y=[smape_values_all["O3"][day], smape_values_all["NO2"][day]],
204
+ name=f"Day {day + 1}",
205
+ )
206
+ )
207
+ fig_smape.update_layout(
208
+ title="SMAPE for Predictions Over 3 Days",
209
+ yaxis_title="SMAPE (%)",
210
+ xaxis_title="Pollutant",
211
+ barmode="group",
212
+ )
213
+ st.plotly_chart(fig_smape)
214
+
215
+ # Calculate total current SMAPE and RMSE
216
+ total_O3_smape = sum(smape_values_all["O3"]) / len(smape_values_all)
217
+ total_NO2_smape = sum(smape_values_all["NO2"]) / len(smape_values_all)
218
+ total_O3_rmse = sum(rmse_values_all["O3"]) / len(rmse_values_all)
219
+ total_NO2_rmse = sum(rmse_values_all["NO2"]) / len(rmse_values_all)
220
+
221
+ # Display metrics table
222
+ metrics_data = {
223
+ "Metric": [
224
+ "Current NO2 SMAPE (%)",
225
+ "Current NO2 RMSE (µg/m3)",
226
+ "Current O3 SMAPE (%)",
227
+ "Current O3 RMSE (µg/m3)",
228
+ ],
229
+ "Value": [total_NO2_smape, total_NO2_rmse, total_O3_smape, total_O3_rmse],
230
+ }
231
+ metrics_df = pd.DataFrame(metrics_data)
232
+ st.table(metrics_df)
233
+
234
+ evaluate_predictions_all_days(actual_data, prediction_data)
past_pollution_data.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,NO2,O3
2
+ 2023-10-18,10.8427027027027,39.81260000000001
3
+ 2023-10-19,17.97026666666666,31.779024390243908
4
+ 2023-10-20,17.233055555555563,18.7156
5
+ 2023-10-21,15.023599999999991,22.04
6
+ 2023-10-22,8.723378378378372,48.33439999999999
7
+ 2023-10-23,20.63426666666668,15.586000000000002
8
+ 2023-10-24,15.1156,24.62808510638297
9
+ 2023-10-25,22.88567567567568,27.117599999999992
10
+ 2023-10-26,21.53175675675676,13.3216
11
+ 2023-10-27,23.07226666666666,16.15416666666666
12
+ 2023-10-28,24.89121621621622,24.59040816326531
13
+ 2023-10-29,9.724428571428573,51.525200000000005
14
+ 2023-10-30,11.20205479452055,52.820600000000006
15
+ 2023-10-31,17.494666666666667,44.458541666666655
16
+ 2023-11-01,21.588095238095235,29.20631578947369
17
+ 2023-11-02,9.745714285714286,48.39760869565216
18
+ 2023-11-03,7.163243243243242,61.421599999999984
past_weather_data.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
2
+ 2023-10-17,8.5,84.8,0.0,22.3,1019.3,34.8,75.2
3
+ 2023-10-18,9.0,77.9,2.3,25.9,1006.0,23.8,71.2
4
+ 2023-10-19,14.5,94.0,11.4,22.3,990.8,21.2,39.8
5
+ 2023-10-20,11.9,97.4,20.4,25.9,981.0,10.4,7.0
6
+ 2023-10-21,13.1,88.0,3.5,22.3,989.4,27.7,39.9
7
+ 2023-10-22,12.1,87.3,3.9,25.9,1003.6,32.3,55.9
8
+ 2023-10-23,9.9,95.7,0.5,18.0,1011.1,5.9,43.8
9
+ 2023-10-24,11.6,92.3,6.5,22.3,1001.3,23.1,32.6
10
+ 2023-10-25,9.3,96.8,15.3,18.0,996.8,15.7,14.5
11
+ 2023-10-26,9.4,97.6,0.1,11.2,995.6,4.8,36.0
12
+ 2023-10-27,10.6,97.9,11.4,14.8,992.0,9.5,20.5
13
+ 2023-10-28,11.4,88.6,3.0,18.4,994.4,29.3,48.5
14
+ 2023-10-29,13.0,82.2,9.5,31.7,991.5,38.8,35.4
15
+ 2023-10-30,11.2,90.4,13.0,18.4,997.5,28.8,27.0
16
+ 2023-10-31,11.0,93.7,18.6,18.0,1000.7,17.9,29.8
17
+ 2023-11-01,12.4,88.5,4.9,25.9,997.8,32.6,31.5
18
+ 2023-11-02,11.0,80.0,8.7,46.4,976.4,33.6,21.5
19
+ 2023-11-03,9.6,83.3,7.9,32.4,981.6,31,40.1
pollution_data.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,NO2,O3
2
+ 2024-10-17,22.804605103280675,22.769159859976643
3
+ 2024-10-18,23.26858769887009,23.30733245729302
4
+ 2024-10-19,23.91006441223834,23.1717142857143
5
+ 2024-10-20,22.57323754789273,23.53784452296821
6
+ 2024-10-21,21.1457004830918,24.02069565217393
7
+ 2024-10-22,21.77657980456027,23.33588571428572
8
+ 2024-10-23,21.974793814433,22.21468879668051
9
+ 2024-10-24,25.51256756756757,20.91370967741937
10
+ 2024-10-25,21.72051282051282,22.33230769230769
11
+ 2024-10-26,24.46423484380123,18.70331123489324
12
+ 2024-10-27,27.53722134983982,20.80809239842384
13
+ 2024-10-28,23.337567567567568,26.82861788617886
14
+ 2024-10-29,16.53533209586906,23.28254887605004
15
+ 2024-10-30,22.26162162162162,18.03443548387097
16
+ 2024-10-31,24.919333333333334,20.79696
predictions_history.csv ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pollutant,date_predicted,date,prediction_value
2
+ O3,2024-10-14,2024-10-17,31.25335185244893
3
+ NO2,2024-10-14,2024-10-17,26.421736787446267
4
+ O3,2024-10-15,2024-10-17,22.00005767760448
5
+ NO2,2024-10-15,2024-10-17,28.59511317503212
6
+ O3,2024-10-16,2024-10-17,9.657466070999735
7
+ NO2,2024-10-16,2024-10-17,17.065168790519902
8
+ O3,2024-10-15,2024-10-18,6.561248
9
+ NO2,2024-10-15,2024-10-18,26.443672
10
+ O3,2024-10-16,2024-10-18,19.782418
11
+ NO2,2024-10-16,2024-10-18,36.453956
12
+ O3,2024-10-17,2024-10-18,16.08841798553393
13
+ NO2,2024-10-17,2024-10-18,32.0458143607889
14
+ O3,2024-10-16,2024-10-19,24.031357603260783
15
+ NO2,2024-10-16,2024-10-19,20.08389395558791
16
+ O3,2024-10-17,2024-10-19,21.031357603260783
17
+ NO2,2024-10-17,2024-10-19,27.08389395558791
18
+ O3,2024-10-17,2024-10-20,20.48486247979324
19
+ NO2,2024-10-17,2024-10-20,23.84300578029378
20
+ O3,2024-10-18,2024-10-19,22.304547122637445
21
+ NO2,2024-10-18,2024-10-19,20.80017116560889
22
+ O3,2024-10-18,2024-10-20,31.25335185244893
23
+ NO2,2024-10-18,2024-10-20,29.732316066240585
24
+ O3,2024-10-18,2024-10-21,28.67755196805434
25
+ NO2,2024-10-18,2024-10-21,35.04638743773354
26
+ O3,2024-10-19,2024-10-20,26.421736787446267
27
+ NO2,2024-10-19,2024-10-20,27.399885723190767
28
+ O3,2024-10-19,2024-10-21,17.065168790519902
29
+ NO2,2024-10-19,2024-10-21,18.992352714813563
30
+ O3,2024-10-19,2024-10-22,17.39682962048955
31
+ NO2,2024-10-19,2024-10-22,22.85061675885908
32
+ O3,2024-10-20,2024-10-21,22.00005767760448
33
+ NO2,2024-10-20,2024-10-21,18.27191592927812
34
+ O3,2024-10-20,2024-10-22,29.00940466937953
35
+ NO2,2024-10-20,2024-10-22,19.50739766963497
36
+ O3,2024-10-20,2024-10-23,20.062134354543343
37
+ NO2,2024-10-20,2024-10-23,23.65746607099973
38
+ O3,2024-10-21,2024-10-22,17.497382318189132
39
+ NO2,2024-10-21,2024-10-22,28.59511317503212
40
+ O3,2024-10-21,2024-10-23,16.519952190354232
41
+ NO2,2024-10-21,2024-10-23,30.192389708351826
42
+ O3,2024-10-21,2024-10-24,28.19940385112904
43
+ NO2,2024-10-21,2024-10-24,17.9525039623211
44
+ O3,2024-10-22,2024-10-23,16.093074246425157
45
+ NO2,2024-10-22,2024-10-23,25.217639978187005
46
+ O3,2024-10-22,2024-10-24,23.605545201596552
47
+ NO2,2024-10-22,2024-10-24,29.004701753536988
48
+ O3,2024-10-23,2024-10-24,26.56486295059828
49
+ NO2,2024-10-23,2024-10-24,20.15373733747257
50
+ O3,2024-10-24,2024-10-25,10.33808859423279
51
+ NO2,2024-10-24,2024-10-25,25.68519991558237
52
+ O3,2024-10-24,2024-10-26,16.000984317626852
53
+ NO2,2024-10-24,2024-10-26,25.760307451092384
54
+ O3,2024-10-24,2024-10-27,19.64377495640328
55
+ NO2,2024-10-24,2024-10-27,31.210576791105115
56
+ O3,2024-10-25,2024-10-26,20.48055947200643
57
+ NO2,2024-10-25,2024-10-26,23.95723903986424
58
+ O3,2024-10-25,2024-10-27,11.088152958498888
59
+ NO2,2024-10-25,2024-10-27,32.274494671100506
60
+ O3,2024-10-25,2024-10-28,-0.7175631399505704
61
+ NO2,2024-10-25,2024-10-28,40.86107800019054
62
+ O3,2024-10-28,2024-10-29,22.13652238154496
63
+ NO2,2024-10-28,2024-10-29,31.608886931951144
64
+ O3,2024-10-28,2024-10-30,15.841669224
65
+ NO2,2024-10-28,2024-10-30,34.564284711452984
66
+ O3,2024-10-28,2024-10-31,22.35944571003375
67
+ NO2,2024-10-28,2024-10-31,34.37482132111927
68
+ O3,2024-10-30,2024-10-31,15.98046542733637
69
+ NO2,2024-10-30,2024-10-31,29.77507241979599
70
+ O3,2024-10-30,2024-11-01,21.135906183680472
71
+ NO2,2024-10-30,2024-11-01,28.38872595850704
72
+ O3,2024-10-30,2024-11-02,19.67426015042635
73
+ O3,2024-10-31,2024-11-01,16.491393851863755
74
+ NO2,2024-10-31,2024-11-01,17.22825222459993
75
+ O3,2024-10-31,2024-11-02,16.874728806873033
76
+ NO2,2024-10-31,2024-11-02,14.771381333796965
77
+ O3,2024-10-31,2024-11-03,15.244292496093546
78
+ NO2,2024-10-31,2024-11-03,14.606430068166452
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ joblib # or pickle if you're using that to load the model
5
+ scikit-learn # for mock model
6
+ altair
7
+ matplotlib
8
+ plotly
9
+ # NOTE: "http.client" and "datetime" removed — both are Python standard-library modules, not installable PyPI distributions.
11
+ huggingface-hub
12
+ python-dotenv
13
+ torch
14
+ safetensors
src/data_api_calls.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import csv
3
+ import http.client
4
+ import os
5
+ import re
6
+ import sys
7
+ import urllib.request
8
+ from datetime import date, timedelta
9
+ from io import StringIO
10
+
11
+ import pandas as pd
12
+
13
+ WEATHER_DATA_FILE = "weather_data.csv"
14
+ POLLUTION_DATA_FILE = "pollution_data.csv"
15
+
16
+
17
def update_weather_data():
    """Fetch daily weather for Utrecht and append it to WEATHER_DATA_FILE.

    Pulls every day from the day after the last stored date (or the last
    7 days when no file exists) up to today from the Visual Crossing
    timeline API, then de-duplicates on "date" keeping the newest row.
    Exits the process on API failure (mirrors original behaviour).
    """
    today = date.today().isoformat()

    if os.path.exists(WEATHER_DATA_FILE):
        df = pd.read_csv(WEATHER_DATA_FILE)
        last_date = pd.to_datetime(df["date"]).max()
        start_date = (last_date + timedelta(1)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (date.today() - timedelta(7)).isoformat()

    # SECURITY: prefer the key from the environment; the inline fallback
    # keeps existing deployments working but should be rotated/removed.
    api_key = os.getenv("VISUAL_CROSSING_API_KEY", "7Y6AY56M6RWVNHQ3SAVHNJWFS")
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"Utrecht/{start_date}/{today}"
        "?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity"
        f"&include=days&key={api_key}&maxStations=1&contentType=csv"
    )

    try:
        # Context manager closes the HTTP response even on parse errors.
        with urllib.request.urlopen(url) as result_bytes:
            rows = list(csv.reader(codecs.iterdecode(result_bytes, "utf-8")))

        # First CSV row is the header; "datetime" becomes our "date" key.
        new_data = pd.DataFrame(rows[1:], columns=rows[0])
        new_data = new_data.rename(columns={"datetime": "date"})

        updated_df = pd.concat([df, new_data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        # HTTPError carries a response body with the API's error message.
        print("Error code: ", e.code, e.read().decode())
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read()/.code attributes — the old
        # handler itself raised AttributeError; it only exposes .reason.
        print("Connection error: ", e.reason)
        sys.exit()
51
+
52
+
53
def update_pollution_data():
    """Fetch today's NO2/O3 measurements and append them to POLLUTION_DATA_FILE.

    Queries the Luchtmeetnet open API for three Utrecht stations, averages
    the numeric readings per particle over the last 24 hours, and appends
    one row for today (de-duplicated on "date"). No-op when the stored
    file already contains a row for today.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    today = date.today().isoformat() + "T09:00:00Z"
    yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"

    # BUG FIX: the old code referenced `existing_data` at the end even when
    # the file did not exist, raising NameError on first run.
    if os.path.exists(POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.Timestamp(date.today()):
            print("Data is already up to date.")
            return
    else:
        existing_data = pd.DataFrame()

    for particle in particles:
        # BUG FIX: this list was previously created once outside both loops,
        # so the second particle's average also included the first
        # particle's frames, contaminating the O3 value with NO2 readings.
        all_dataframes = []
        for station in stations:
            conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
            try:
                conn.request(
                    "GET",
                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                    "",
                    {},
                )
                decoded_data = conn.getresponse().read().decode("utf-8")
            finally:
                conn.close()  # don't leak one socket per station/particle
            df = pd.read_csv(StringIO(decoded_data))
            # Keep only the columns whose labels contain "value".
            all_dataframes.append(df.filter(like="value"))

        combined_data = pd.concat(all_dataframes, ignore_index=True)

        # Iterating a DataFrame yields its column labels; the numeric
        # readings end up embedded in those labels after parsing the API
        # response with read_csv — presumably the payload is JSON-ish.
        # NOTE(review): fragile; consider parsing the JSON body directly.
        values = []
        for row in combined_data:
            cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
            if cleaned_value:
                values.append(float(cleaned_value[0]))

        if values:
            avg = sum(values) / len(values)
            if particle == "NO2":
                NO2.append(avg)
            else:
                O3.append(avg)

    new_data = pd.DataFrame(
        {
            "date": [date.today()],
            "NO2": NO2,
            "O3": O3,
        }
    )

    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
114
+
115
+
116
def get_combined_data():
    """Build the last-7-days weather+pollution table used by the dashboard.

    Reads the stored weather and pollution CSVs, restricts both to the
    last seven days, renames/reorders columns to the model's schema and
    rescales units into KNMI-style tenths. Returns one row per day.
    """
    weather_df = pd.read_csv(WEATHER_DATA_FILE)

    today = pd.Timestamp.now().normalize()
    seven_days_ago = today - pd.Timedelta(days=7)
    weather_df["date"] = pd.to_datetime(weather_df["date"])
    weather_df = weather_df[
        (weather_df["date"] >= seven_days_ago) & (weather_df["date"] <= today)
    ]

    # Insert placeholder pollution/weekday columns, then shuffle columns
    # into the exact order the downstream pipeline expects.
    weather_df.insert(1, "NO2", None)
    weather_df.insert(2, "O3", None)
    weather_df.insert(10, "weekday", None)
    columns = list(weather_df.columns)
    columns.insert(3, columns.pop(6))
    weather_df = weather_df[columns]
    columns.insert(5, columns.pop(9))
    weather_df = weather_df[columns]
    columns.insert(9, columns.pop(6))
    weather_df = weather_df[columns]

    combined_df = weather_df

    # Rename to the model's feature names ("percipitation" [sic] is the
    # spelling used consistently throughout the project).
    combined_df = combined_df.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    combined_df["date"] = pd.to_datetime(combined_df["date"])
    combined_df["weekday"] = combined_df["date"].dt.day_name()

    # Convert to tenths-based units (0.1 m/s, 0.1 °C, 0.1 mm, 0.1 hPa, ...)
    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
    combined_df["percipitation"] = combined_df["percipitation"] * 10
    combined_df["pressure"] = combined_df["pressure"] * 10

    for col in (
        "wind_speed",
        "mean_temp",
        "minimum_visibility",
        "percipitation",
        "pressure",
        "humidity",
        "global_radiation",
    ):
        combined_df[col] = combined_df[col].astype(int)

    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)

    pollution_df["date"] = pd.to_datetime(pollution_df["date"])
    pollution_df = pollution_df[
        (pollution_df["date"] >= seven_days_ago) & (pollution_df["date"] <= today)
    ]

    # BUG FIX: plain column assignment aligns on the integer index, which
    # differs between the two CSVs after filtering (each keeps its original
    # row labels), silently producing NaN/misaligned pollution values.
    # Align explicitly on the date instead.
    pollution_by_date = pollution_df.set_index("date")
    combined_df["NO2"] = combined_df["date"].map(pollution_by_date["NO2"])
    combined_df["O3"] = combined_df["date"].map(pollution_by_date["O3"])

    return combined_df
src/features_pipeline.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import warnings
3
+
4
+ import joblib
5
+ import numpy as np
6
+ import pandas as pd
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import hf_hub_download, login
9
+
10
+ from src.past_data_api_calls import get_past_combined_data
11
+
12
# Module import side effects: silence warnings, load .env, and authenticate
# with the Hugging Face Hub so scaler downloads in create_features() work.
# NOTE(review): blanket warning suppression hides real issues — consider scoping it.
warnings.filterwarnings("ignore")

load_dotenv()
login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
16
+
17
+
18
def create_features(
    data,
    target_particle,  # Added this parameter
    lag_days=7,
    sma_days=7,
):
    """Build the scaled feature matrix used by the forecasting models.

    Adds cyclical weekday/month encodings, per-feature lags (1..lag_days),
    a simple moving average over ``sma_days``, and last-year pollutant
    values from ``get_past_combined_data()``, then applies the feature
    scaler downloaded from the Hugging Face Hub.

    Args:
        data: DataFrame with a ``date`` column plus the weather/pollution
            columns listed in ``lag_features`` below.
        target_particle: "O3" or "NO2"; for NO2 two extra columns
            (percipitation, pressure) are lagged as well.
        lag_days: number of daily lags generated per feature.
        sma_days: window length for the moving-average features.

    Returns:
        DataFrame of scaled features (rows made incomplete by lagging are
        dropped).

    Raises:
        ValueError: if ``target_particle`` is not "O3" or "NO2".
    """
    lag_features = [
        "NO2",
        "O3",
        "wind_speed",
        "mean_temp",
        "global_radiation",
        "minimum_visibility",
        "humidity",
    ]
    if target_particle == "NO2":
        lag_features = lag_features + ["percipitation", "pressure"]

    if target_particle not in ["O3", "NO2"]:
        raise ValueError("target_particle must be 'O3' or 'NO2'")

    data = data.copy()
    data["date"] = pd.to_datetime(data["date"])
    data = data.sort_values("date").reset_index(drop=True)

    # Extract 'weekday' and 'month' from 'date' if not present.  A string
    # weekday column (day names) is replaced with numeric codes so the
    # cyclical encoding below works.
    if "weekday" not in data.columns or data["weekday"].dtype == object:
        data["weekday"] = data["date"].dt.weekday  # Monday=0, Sunday=6
    if "month" not in data.columns:
        data["month"] = data["date"].dt.month  # 1 to 12

    # Create sine and cosine transformations for 'weekday' and 'month'
    # so the models see their cyclical nature (Sunday is adjacent to Monday).
    data["weekday_sin"] = np.sin(2 * np.pi * data["weekday"] / 7)
    data["weekday_cos"] = np.cos(2 * np.pi * data["weekday"] / 7)
    data["month_sin"] = np.sin(2 * np.pi * (data["month"] - 1) / 12)
    data["month_cos"] = np.cos(2 * np.pi * (data["month"] - 1) / 12)

    # Create lagged features for the specified lag days
    for feature in lag_features:
        for lag in range(1, lag_days + 1):
            data[f"{feature}_lag_{lag}"] = data[feature].shift(lag)

    # Create SMA (simple moving average) features
    for feature in lag_features:
        data[f"{feature}_sma_{sma_days}"] = (
            data[feature].rolling(window=sma_days).mean()
        )

    # Create particle data (NO2 and O3) from the same time last year.
    # NOTE(review): get_past_combined_data() returns 11 rows covering
    # 7 days before through 3 days after "today minus one year", so
    # iloc[-4] is assumed to be today-last-year — confirm alignment.
    past_data = get_past_combined_data()

    # Today last year
    data["O3_last_year"] = past_data["O3"].iloc[-4]
    data["NO2_last_year"] = past_data["NO2"].iloc[-4]

    # 7 days before today last year
    # NOTE(review): iloc[i - 1] maps i=1 to the OLDEST row (7 days before),
    # so the "_{i}_days_before" labels may be reversed relative to their
    # names — confirm the training pipeline used the same convention.
    for i in range(1, lag_days + 1):
        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i - 1]
        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i - 1]

    # 3 days after today last year
    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]
    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]

    # Drop missing values (the first lag_days rows are incomplete by design)
    rows_before = data.shape[0]
    data = data.dropna().reset_index(drop=True)
    rows_after = data.shape[0]
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}/{rows_before}")
    print(data)

    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

    # Define feature columns; raw date/weekday/month are excluded but their
    # sin/cos encodings are kept.  NOTE(review): the raw NO2/O3 columns stay
    # in the features — presumably the scaler was fitted that way; verify.
    exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Split features and targets
    x = data[feature_cols]

    # Scale with the scaler fitted at training time (downloaded from HF Hub)
    repo_id = f"elisaklunder/Utrecht-{target_particle}-Forecasting-Model"
    file_name = f"feature_scaler_{target_particle}.joblib"
    path = hf_hub_download(repo_id=repo_id, filename=file_name)
    feature_scaler = joblib.load(path)
    X_scaled = feature_scaler.transform(x)

    # Convert scaled data back to DataFrame for consistency
    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=x.index)

    return X_scaled
src/helper_functions.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
+ # Custom function to create styled metric boxes with compact layout
5
def custom_metric_box(label, value):
    """Render a compact, left-aligned metric card: small label over a bold value."""
    # Build the HTML once, then hand it to Streamlit with raw-HTML rendering on.
    card_html = f"""
    <div style="
        padding: 5px;
        margin-bottom: 5px;
        width: 100%; /* Full width */
        display: flex;
        flex-direction: column; /* Align items vertically */
        align-items: flex-start; /* Align all content to the left */
    ">
        <div>
            <h4 style="font-size: 14px; font-weight: normal; margin: 0;">{label}</h4> <!-- Smaller label -->
        </div>
        <div>
            <p style="font-size: 18px; font-weight: bold; margin: 0;">{value}</p> <!-- Smaller metric -->
        </div>
    </div>
    """
    st.markdown(card_html, unsafe_allow_html=True)
23
+
24
# Pollution metric card: pollutant name, a color-coded Good/Bad verdict, and
# the measured value. ``delta`` is accepted for call compatibility but unused.
def pollution_box(label, value, delta, threshold):
    """Render a glassmorphism-styled card showing pollutant status vs. threshold."""
    # "Good" strictly below the threshold, otherwise "Bad"; pick color to match.
    below_limit = float(value.split()[0]) < threshold
    status = "Good" if below_limit else "Bad"
    status_color = "#77C124" if below_limit else "#E68B0A"

    card_html = f"""
    <div style="
        background: rgba(255, 255, 255, 0.05);
        border-radius: 16px;
        box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
        backdrop-filter: blur(5px);
        -webkit-backdrop-filter: blur(5px);
        border: 1px solid rgba(255, 255, 255, 0.15);
        padding: 15px;
        margin-bottom: 10px;
    ">
        <h4 style="font-size: 24px; font-weight: bold; margin: 0;">{label}</h4> <!-- Bigger label -->
        <p style="font-size: 36px; font-weight: bold; color: {status_color}; margin: 0;">{status}</p> <!-- Good/Bad with color -->
        <p style="font-size: 18px; margin: 0;">{value}</p> <!-- Smaller value where delta used to be -->
    </div>
    """
    st.markdown(card_html, unsafe_allow_html=True)
src/past_data_api_calls.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import csv
3
+ import http.client
4
+ import os
5
+ import re
6
+ import sys
7
+ import urllib.request
8
+ from datetime import date, timedelta
9
+ from io import StringIO
10
+
11
+ import pandas as pd
12
+
13
# Local CSV caches of last year's weather and pollution observations,
# incrementally updated by the functions below.
PAST_WEATHER_DATA_FILE = "past_weather_data.csv"
PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
15
+
16
+
17
def update_past_weather_data():
    """Fetch last-year weather for Utrecht from Visual Crossing and append it
    to PAST_WEATHER_DATA_FILE, de-duplicating by date.

    On the first run, fetches a window from 8 days before to 2 days after
    "today minus one year"; later runs resume from the latest cached date.
    Exits the process on HTTP/URL errors (preserved original behavior).
    """
    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_WEATHER_DATA_FILE):
        df = pd.read_csv(PAST_WEATHER_DATA_FILE)
        # Resume from the most recent date already cached.
        start_date = pd.to_datetime(df["date"]).max().date().isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()
    else:
        df = pd.DataFrame()
        start_date = (last_year_date - timedelta(days=8)).isoformat()
        end_date = (last_year_date + timedelta(days=2)).isoformat()

    # SECURITY(review): the API key is hard-coded in the URL below — it is
    # leaked in version control and should be rotated and read from an
    # environment variable instead.
    try:
        ResultBytes = urllib.request.urlopen(
            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
        )
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))

        data = pd.DataFrame(list(CSVText))
        data.columns = data.iloc[0]  # first CSV row is the header
        data = data[1:]
        data = data.rename(columns={"datetime": "date"})

        # Merge with the cache, keeping the freshest row per date.
        updated_df = pd.concat([df, data], ignore_index=True)
        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
        updated_df.to_csv(PAST_WEATHER_DATA_FILE, index=False)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print("Error code: ", e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # BUG FIX: URLError has no .read() or .code attribute; the old
        # handler raised AttributeError here and masked the real failure.
        print("URL error: ", e.reason)
        sys.exit()
52
+
53
+
54
def update_past_pollution_data():
    """Incrementally update PAST_POLLUTION_DATA_FILE with daily NO2/O3
    averages (across three Utrecht stations) for the last-year window,
    fetched from the Luchtmeetnet open API.

    Returns:
        (NO2, O3): the lists of newly computed daily averages, or None if
        the cache was already up to date.
    """
    O3 = []
    NO2 = []
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    all_dataframes = []

    last_year_date = date.today() - timedelta(days=365)

    if os.path.exists(PAST_POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)
        last_date = pd.to_datetime(existing_data["date"]).max()
        if last_date >= pd.to_datetime(last_year_date):
            print("Data is already up to date.")
            return
        else:
            # Resume from the last cached date up to 3 days past last year.
            start_date = last_date.date()
            end_date = last_year_date + timedelta(days=3)
    else:
        existing_data = pd.DataFrame()
        start_date = last_year_date - timedelta(days=7)
        end_date = last_year_date + timedelta(days=3)

    # Inclusive list of every day in [start_date, end_date].
    date_list = [
        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
    ]
    for current_date in date_list:
        # 24h window ending at 09:00 UTC of the current day.
        today = current_date.isoformat() + "T09:00:00Z"
        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
        for particle in particles:
            all_dataframes = []  # Reset for each particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                payload = ""
                headers = {}
                conn.request(
                    "GET",
                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
                    payload,
                    headers,
                )
                res = conn.getresponse()
                data = res.read()
                # NOTE(review): the API presumably returns JSON, which is
                # parsed here as CSV; the numeric values end up embedded in
                # column labels rather than cells — confirm this is intended.
                decoded_data = data.decode("utf-8")
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like="value")
                all_dataframes.append(df)

            combined_data = pd.concat(all_dataframes, ignore_index=True)
            values = []
            # NOTE(review): iterating a DataFrame yields its COLUMN LABELS,
            # not rows; the regex extracts the first number from each label.
            for row in combined_data:
                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
                if cleaned_value:
                    values.append(float(cleaned_value[0]))

            if values:
                avg = sum(values) / len(values)
                if particle == "NO2":
                    NO2.append(avg)
                else:
                    O3.append(avg)
            # NOTE(review): when no values are found for a day, nothing is
            # appended, so NO2/O3 can end up shorter than date_list and the
            # DataFrame constructor below would raise — verify upstream.

    new_data = pd.DataFrame(
        {
            "date": date_list,
            "NO2": NO2,
            "O3": O3,
        }
    )

    # Merge with the cache, keeping the freshest row per date.
    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)

    updated_data.to_csv(PAST_POLLUTION_DATA_FILE, index=False)

    return NO2, O3
130
+
131
+
132
def get_past_combined_data():
    """Return the last-year window (11 rows) of combined weather + pollution
    data, renamed and scaled to the training-data conventions (most
    quantities stored as integers in tenths of their original units)."""
    # Make sure both local CSV caches are current before reading them.
    update_past_weather_data()
    update_past_pollution_data()

    weather = pd.read_csv(PAST_WEATHER_DATA_FILE)
    pollution = pd.read_csv(PAST_POLLUTION_DATA_FILE)

    # Keep only dates present in both sources; the final 11 rows cover
    # 7 days before through 3 days after "today minus one year".
    merged = weather.merge(pollution, on="date", how="inner").tail(11)

    merged = merged.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    merged["date"] = pd.to_datetime(merged["date"])
    merged["weekday"] = merged["date"].dt.day_name()

    # Coerce the numeric columns; pressure and humidity are pre-rounded
    # before any scaling, matching the original pipeline exactly.
    for col in ("wind_speed", "mean_temp", "minimum_visibility",
                "percipitation", "global_radiation"):
        merged[col] = merged[col].astype(float)
    for col in ("pressure", "humidity"):
        merged[col] = merged[col].astype(float).round()

    # Unit scaling: wind speed km/h -> m/s, then everything to tenths.
    merged["wind_speed"] = (merged["wind_speed"] / 3.6) * 10
    for col in ("mean_temp", "minimum_visibility", "percipitation", "pressure"):
        merged[col] = merged[col] * 10

    # Final integer representation for every model feature column.
    for col in (
        "wind_speed",
        "mean_temp",
        "minimum_visibility",
        "percipitation",
        "pressure",
        "humidity",
        "global_radiation",
    ):
        merged[col] = merged[col].astype(float).round().astype(int)

    return merged
src/predict.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import date, datetime, timedelta
3
+
4
+ import joblib
5
+ import pandas as pd
6
+ import torch
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import hf_hub_download, login
9
+
10
+ from src.data_api_calls import (
11
+ get_combined_data,
12
+ update_pollution_data,
13
+ update_weather_data,
14
+ )
15
+ from src.features_pipeline import create_features
16
+
17
+ load_dotenv()
18
+ login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))
19
+
20
+
21
def load_nn():
    """Download and return the NO2 feed-forward network from the HF Hub.

    The architecture is defined inline; its submodule names (`layers_list`,
    `output`) must match the uploaded checkpoint's state_dict exactly, so
    do not rename them.
    """
    # Function-scoped imports keep these heavyweight deps out of module import.
    import torch.nn as nn
    from huggingface_hub import PyTorchModelHubMixin

    class AirPollutionNet(nn.Module, PyTorchModelHubMixin):
        """MLP: [Linear -> ReLU -> Dropout] per hidden layer; 3 outputs (next 3 days)."""

        def __init__(self, input_size, layers, dropout_rate):
            super(AirPollutionNet, self).__init__()
            self.layers_list = nn.ModuleList()
            in_features = input_size

            # One Linear/ReLU/Dropout triple per requested hidden width.
            for units in layers:
                self.layers_list.append(nn.Linear(in_features, units))
                self.layers_list.append(nn.ReLU())
                self.layers_list.append(nn.Dropout(p=dropout_rate))
                in_features = units

            self.output = nn.Linear(in_features, 3)  # Output size is 3 for next 3 days

        def forward(self, x):
            # Sequentially apply the hidden stack, then the output head.
            for layer in self.layers_list:
                x = layer(x)
            x = self.output(x)
            return x

    # Weights and config (input_size, layers, dropout_rate) come from the Hub.
    model = AirPollutionNet.from_pretrained(
        "akseljoonas/Utrecht_pollution_forecasting_NO2"
    )
    return model
49
+
50
+
51
def load_model(particle):
    """Load the forecasting model for ``particle``.

    "O3" uses a scikit-learn SVR pickled on the HF Hub; anything else
    (i.e. "NO2") uses the PyTorch network from load_nn().
    """
    if particle != "O3":
        return load_nn()

    # O3: download the pickled SVR once and unpickle it.
    model_path = hf_hub_download(
        repo_id=f"elisaklunder/Utrecht-{particle}-Forecasting-Model",
        filename="O3_svr_model.pkl",
    )
    return joblib.load(model_path)
61
+
62
+
63
def run_model(particle, data):
    """Produce the 3-day forecast for ``particle`` from a week of raw data.

    Builds scaled features, runs the matching model, then inverse-transforms
    the output with the target scaler fetched from the HF Hub.
    """
    features = create_features(data=data, target_particle=particle)
    model = load_model(particle)

    if particle == "NO2":
        # Torch network: run without autograd bookkeeping.
        with torch.no_grad():
            prediction = model(torch.tensor(features.values, dtype=torch.float32))
        scaler_repo = "akseljoonas/Utrecht_pollution_forecasting_NO2"
        scaler_file = "target_scaler_NO2.joblib"
    else:
        # scikit-learn SVR path.
        prediction = model.predict(features)
        scaler_repo = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
        scaler_file = f"target_scaler_{particle}.joblib"

    # Undo the target scaling applied at training time.
    scaler_path = hf_hub_download(repo_id=scaler_repo, filename=scaler_file)
    target_scaler = joblib.load(scaler_path)
    prediction = target_scaler.inverse_transform(prediction)

    return prediction
84
+
85
+
86
def update_data_and_predictions():
    """Refresh source data, run both models, and append today's 3-day O3/NO2
    forecasts to predictions_history.csv.

    Rows predicted earlier today are removed first, so reruns on the same
    day replace rather than duplicate predictions.
    """
    update_weather_data()
    update_pollution_data()

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    # One row per pollutant per horizon (tomorrow .. 3 days ahead).
    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    PREDICTIONS_FILE = "predictions_history.csv"

    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        # Filter out predictions made today to avoid duplicates
        existing_data = existing_data[
            ~(existing_data["date_predicted"] == str(date.today()))
        ]
        combined_data = pd.concat([existing_data, predictions_df])
        # BUG FIX: drop_duplicates() returns a new DataFrame; the result was
        # previously discarded, so duplicates were never actually removed.
        combined_data = combined_data.drop_duplicates()
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)
130
+
131
+
132
def get_data_and_predictions():
    """Return this week's combined data plus the O3 and NO2 predictions
    that were stored today in predictions_history.csv."""
    week_data = get_combined_data()

    PREDICTIONS_FILE = "predictions_history.csv"
    history = pd.read_csv(PREDICTIONS_FILE)

    # Only the rows written by today's prediction run.
    today_str = datetime.today().strftime("%Y-%m-%d")
    made_today = history[history["date_predicted"] == today_str]

    def _values_for(pollutant):
        # Prediction values for one pollutant, in stored order.
        mask = made_today["pollutant"] == pollutant
        return made_today.loc[mask, "prediction_value"].values

    return week_data, [_values_for("O3")], [_values_for("NO2")]
150
+
151
+ if __name__=="__main__":
152
+ update_data_and_predictions()
weather_data.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
2
+ 2024-10-17,16.9,86.0,0.6,18.4,1010.0,37.1,43.0
3
+ 2024-10-18,15.5,97.3,3.9,7.6,1014.0,4.5,42.9
4
+ 2024-10-19,14.7,89.9,1.6,14.8,1014.1,22.8,43.5
5
+ 2024-10-20,15.5,83.8,0.5,29.5,1016.0,41.5,0.0
6
+ 2024-10-21,14.4,92.7,4.3,21.2,1020.6,22.0,27.8
7
+ 2024-10-22,11.4,92.8,4.9,19.4,1026.9,22.6,57.0
8
+ 2024-10-23,11.2,97.3,0.0,13.0,1032.8,6.5,12.5
9
+ 2024-10-24,10.4,94.0,0.0,20.5,1024.7,13.0,62.5
10
+ 2024-10-25,13.6,92.2,0.5,11.9,1016.8,24.0,93.0
11
+ 2024-10-26,13.7,91.5,0.0,11.9,1016.3,23.3,8.0
12
+ 2024-10-27,13.2,87.1,0.1,20.5,1019.4,10.4,28.6
13
+ 2024-10-28,12.4,91.8,1.1,31.7,1021.8,12.8,27.3
14
+ 2024-10-29,13.8,95.9,0.2,20.5,1023.1,8.1,16.0
15
+ 2024-10-30,12.7,92.9,0.6,9.4,1027.5,12.5,32.8
16
+ 2024-10-31,12.5,89.9,0.0,11.2,1027.1,17.1,70.6