app.py gets the data and runs the model; last-year features still to be implemented

- __pycache__/data_api_calls.cpython-312.pyc +0 -0
- app.py +19 -15
- dataset.csv +1 -1
- requirements.txt +1 -0
- scalers/feature_scaler_NO2.joblib +3 -0
- scalers/feature_scaler_O3.joblib +3 -0
- src/data_loading.py +4 -81
- src/models_loading.py +5 -7
- test.ipynb +67 -97
- test.py +4 -10
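
Taken together, the hunks below wire the dashboard to live data: get_data() refreshes the measurements, app.py reads dataset.csv, and run_model() returns predictions mapped back to the original scale. A condensed sketch of the new top-level flow (variable names are taken from the app.py hunk; chart construction is omitted):

    import pandas as pd

    from data_api_calls import get_data
    from src.models_loading import run_model

    get_data()                                  # assumed to refresh dataset.csv via the data APIs
    dataset = pd.read_csv("dataset.csv")
    prediction = run_model("O3", data=dataset)  # 2D array, one row of three-day-ahead values

    # Past values come straight from the dataset, future values from the model
    o3_past_values = dataset["O3"]
    o3_future_values = pd.Series(prediction[0].flatten())
    o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)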
__pycache__/data_api_calls.cpython-312.pyc
CHANGED
Binary files a/__pycache__/data_api_calls.cpython-312.pyc and b/__pycache__/data_api_calls.cpython-312.pyc differ
app.py
CHANGED
@@ -2,9 +2,10 @@ import altair as alt
 import pandas as pd
 import plotly.graph_objects as go
 import streamlit as st
+
+from data_api_calls import get_data
 from src.helper_functions import custom_metric_box, pollution_box
 from src.models_loading import run_model
-from data_api_calls import get_data
 
 st.set_page_config(
     page_title="Utrecht Pollution Dashboard",
@@ -15,10 +16,12 @@ st.set_page_config(
 
 alt.themes.enable("dark")
 
-test_predictions = run_model("O3")
 get_data()
-
-
+dataset = pd.read_csv("dataset.csv")
+prediction = run_model("O3", data=dataset)
+pred1 = prediction[0][0]
+pred2 = prediction[0][1]
+pred3 = prediction[0][2]
 
 # App Title
 st.title("Utrecht Pollution Dashboard🌱")
@@ -54,23 +57,24 @@ with col1:
     pollution_box(label="NO<sub>2</sub>", value="28 µg/m³", delta="+3 µg/m³")
 
 # Sample data (replace with your actual data)
-
-
-
-).to_list()
+# Sample data (replace with your actual data)
+dates_past = pd.date_range(end=pd.Timestamp.today(), periods=8).to_list()
+dates_future = pd.date_range(start=pd.Timestamp.today() + pd.Timedelta(days=1), periods=3).to_list()
 
 # O3 and NO2 values for the past 7 days
-o3_past_values = [
-no2_past_values = [
+o3_past_values = dataset["O3"]
+no2_past_values = dataset["NO2"]
+
+# Predicted O3 and NO2 values for the next 3 days (convert to pandas Series)
+o3_future_values = pd.Series(prediction[0].flatten())  # Flatten the array to 1D
+no2_future_values = pd.Series([26, 27, 28])  # Example prediction data
 
-#
-
-
+# Combine the past and future values using pd.concat
+o3_values = pd.concat([o3_past_values, o3_future_values], ignore_index=True)
+no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
 
 # Combine dates and values
 dates = dates_past + dates_future
-o3_values = o3_past_values + o3_future_values
-no2_values = no2_past_values + no2_future_values
 
 # Create a DataFrame
 df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
dataset.csv
CHANGED
@@ -6,4 +6,4 @@ date,NO2,O3,wind_speed,mean_temp,global_radiation,percipitation,pressure,minimum
 2024-10-19,24.727853658536585,23.52574561403509,43,147,43,28,10140,236,92,Saturday
 2024-10-20,22.700366666666664,24.317572254335257,68,145,0,0,10160,241,82,Sunday
 2024-10-21,19.763439153439155,25.661659574468086,66,142,27,39,10201,110,90,Monday
-2024-10-22,20.281666666666666,25.787520661157025,76,121,54,97,
+2024-10-22,20.281666666666666,25.787520661157025,76,121,54,97,10265,110,86,Tuesday
requirements.txt
CHANGED
@@ -9,3 +9,4 @@ plotly
 http.client
 datetime
 huggingface-hub
+python-dotenv
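
python-dotenv backs the load_dotenv() call in src/models_loading.py (see that file's hunk below), which pulls the Hugging Face credentials from a local .env file before login and hf_hub_download are used. A minimal sketch of the pattern; the variable name HF_TOKEN and the .env layout are assumptions, not taken from this commit:

    # .env (kept out of version control)
    # HF_TOKEN=hf_xxxxxxxxxxxxxxxx

    import os

    from dotenv import load_dotenv
    from huggingface_hub import login

    load_dotenv()                       # load variables from .env into the environment
    login(token=os.getenv("HF_TOKEN"))  # authenticate before downloading the model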
scalers/feature_scaler_NO2.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:005a752194f98e66653af7e3b3461c788fe9a902fb14e1b526aea7ea07201c48
+size 1487
scalers/feature_scaler_O3.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:389fb707d241a8df5c7a228e4aa3ca1ebf434a0c551bdbd66f000cf2e5850fb1
+size 1375
src/data_loading.py
CHANGED
@@ -1,89 +1,12 @@
 import numpy as np
 import pandas as pd
+import joblib
 
-
-def create_lag_features_for_single_day(data, random_index, lag_days):
-    lag_features = [
-        column
-        for column in data.columns
-        if column
-        in [
-            "O3",
-            "NO2",
-            "wind_speed",
-            "mean_temp",
-            "global_radiation",
-            "percipitation",
-            "pressure",
-            "minimum_visibility",
-            "humidity",
-        ]
-    ]
-    lagged_data = {}
-    for feature in lag_features:
-        for lag in range(1, lag_days + 1):
-            try:
-                lagged_value = data.loc[random_index - lag, feature]
-                lagged_data[f"{feature}_lag_{lag}"] = lagged_value
-            except IndexError:
-                print(
-                    f"Value not found for feature {feature} lagged by {lag} from day {random_index}"
-                )
-                continue
-
-    # Add together lagged features, non-lagged features and date
-    current_data = data.iloc[random_index].to_dict()
-    current_data.update(lagged_data)
-    return pd.DataFrame([current_data])
-
-
-def create_targets_for_single_day(data, random_index, target_column, days_ahead):
-    targets = {}
-    for day in range(1, days_ahead + 1):
-        future_index = random_index + day
-        try:
-            targets[f"{target_column}_{day}_days_ahead"] = data.loc[
-                future_index, target_column
-            ]
-        except IndexError:
-            print(
-                f"Value not found for particle {target_column} forwarded by {day} day"
-            )
-
-    return pd.DataFrame([targets])
-
-
-def load_data_batch(data, target_particle, lag_days):
-    data["date"] = pd.to_datetime(data["date"])
-
-    # Exclude period with missing O3 data + buffer before and after for targets and lag features
-    start_exclusion = pd.to_datetime("2022-01-01") - pd.Timedelta(days=3)
-    end_exclusion = pd.to_datetime("2022-04-27") + pd.Timedelta(days=lag_days)
-    valid_data = data[
-        ~((data["date"] >= start_exclusion) & (data["date"] <= end_exclusion))
-    ]
-    valid_data = valid_data[
-        lag_days:-3
-    ]  # also exclude first seven and last three days of the dataset
-
-    # Get random day in the valid data
-    random_index = np.random.choice(valid_data.index, 1)[0]
-
-    # Create lag features for the selected day
-    train_data = create_lag_features_for_single_day(data, random_index, lag_days)
-    targets = create_targets_for_single_day(
-        data, random_index, target_particle, days_ahead=3
-    )
-
-    return train_data, targets
-
-
-def create_features_and_targets(
+def create_features(
     data,
     target_particle,  # Added this parameter
     lag_days=7,
     sma_days=7,
-    days_ahead=3,
 ):
     """
     Creates lagged features, SMA features, last year's particle data (NO2 and O3) for specific days,
@@ -199,7 +122,7 @@ def create_features_and_targets(
 
 
     # Initialize scalers
-    feature_scaler =
+    feature_scaler = joblib.load(f"scalers/feature_scaler_{target_particle}.joblib")
 
     # Fit the scalers on the training data
     X_scaled = feature_scaler.fit_transform(x)
@@ -209,4 +132,4 @@
         X_scaled, columns=feature_cols, index=x.index
     )
 
-    return
+    return X_scaled
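
One detail worth flagging in the hunk above: the scaler is now loaded from disk, but it is still applied with fit_transform, which re-fits it on the incoming rows and discards the parameters saved at training time. If the intent is to reuse the training-time scaling, the usual pattern is transform only; a minimal sketch, not part of this commit:

    import joblib

    feature_scaler = joblib.load(f"scalers/feature_scaler_{target_particle}.joblib")
    X_scaled = feature_scaler.transform(x)  # apply the saved scaling without re-fitting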
src/models_loading.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import streamlit as st
 from dotenv import load_dotenv
 from huggingface_hub import hf_hub_download, login
-
+from src.data_loading import create_features
 
 def load_model(particle):
     load_dotenv()
@@ -24,14 +24,12 @@ def load_model(particle):
 
 
 @st.cache_resource(ttl=6 * 300)  # Reruns every 6 hours
-def run_model(particle):
+def run_model(particle, data):
+    input_data = create_features(data=data, target_particle=particle)
     model = load_model(particle)
 
-    # Static input values
-    input_data = pd.DataFrame(
-        {"Temperature": [20.0], "Wind Speed": [10.0], "Humidity": [50.0]}
-    )
-
     # Run the model with static input
     prediction = model.predict(input_data)
+    target_scaler = joblib.load(f"scalers/target_scaler_{particle}.joblib")
+    prediction = target_scaler.inverse_transform(prediction)
     return prediction
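
With the new signature, callers hand over the raw dataframe and get back predictions passed through target_scaler.inverse_transform, so they are back on the original measurement scale; test.py below exercises exactly this path. Two things the hunks do not show: the joblib.load call relies on a joblib import elsewhere in the module, and scalers/target_scaler_{particle}.joblib is not among the files added in this commit (only the feature scalers are). As an aside, ttl=6 * 300 is 1800 seconds (30 minutes), not the 6 hours the comment claims. A minimal usage sketch:

    import pandas as pd

    from src.models_loading import run_model

    dataset = pd.read_csv("dataset.csv")
    prediction = run_model("O3", data=dataset)
    print(prediction[0])  # one row of inverse-transformed O3 predictions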
test.ipynb
CHANGED
@@ -45,115 +45,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       " .dataframe tbody tr th:only-of-type {\n",
-       " vertical-align: middle;\n",
-       " }\n",
-       "\n",
-       " .dataframe tbody tr th {\n",
-       " vertical-align: top;\n",
-       " }\n",
-       "\n",
-       " .dataframe thead th {\n",
-       " text-align: right;\n",
-       " }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       " <thead>\n",
-       " <tr style=\"text-align: right;\">\n",
-       " <th></th>\n",
-       " <th>NO2</th>\n",
-       " <th>O3</th>\n",
-       " <th>wind_speed</th>\n",
-       " <th>mean_temp</th>\n",
-       " <th>global_radiation</th>\n",
-       " <th>percipitation</th>\n",
-       " <th>pressure</th>\n",
-       " <th>minimum_visibility</th>\n",
-       " <th>humidity</th>\n",
-       " <th>weekday_sin</th>\n",
-       " <th>...</th>\n",
-       " <th>O3_last_year_4_days_before</th>\n",
-       " <th>NO2_last_year_4_days_before</th>\n",
-       " <th>O3_last_year_5_days_before</th>\n",
-       " <th>NO2_last_year_5_days_before</th>\n",
-       " <th>O3_last_year_6_days_before</th>\n",
-       " <th>NO2_last_year_6_days_before</th>\n",
-       " <th>O3_last_year_7_days_before</th>\n",
-       " <th>NO2_last_year_7_days_before</th>\n",
-       " <th>O3_last_year_3_days_after</th>\n",
-       " <th>NO2_last_year_3_days_after</th>\n",
-       " </tr>\n",
-       " </thead>\n",
-       " <tbody>\n",
-       " <tr>\n",
-       " <th>0</th>\n",
-       " <td>20.281667</td>\n",
-       " <td>25.787521</td>\n",
-       " <td>76</td>\n",
-       " <td>121</td>\n",
-       " <td>54</td>\n",
-       " <td>97</td>\n",
-       " <td>10266</td>\n",
-       " <td>116</td>\n",
-       " <td>87</td>\n",
-       " <td>0.781831</td>\n",
-       " <td>...</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " <td>0</td>\n",
-       " </tr>\n",
-       " </tbody>\n",
-       "</table>\n",
-       "<p>1 rows × 103 columns</p>\n",
-       "</div>"
-      ],
      "text/plain": [
-       "
-       "
-       "
-       "
-       "
-       "
-       "
-       "
-       "
-       "
-       "0 0 0 \n",
-       "\n",
-       " O3_last_year_6_days_before NO2_last_year_6_days_before \\\n",
-       "0 0 0 \n",
-       "\n",
-       " O3_last_year_7_days_before NO2_last_year_7_days_before \\\n",
-       "0 0 0 \n",
-       "\n",
-       " O3_last_year_3_days_after NO2_last_year_3_days_after \n",
-       "0 0 0 \n",
-       "\n",
-       "[1 rows x 103 columns]"
+       "Index(['NO2', 'O3', 'wind_speed', 'mean_temp', 'global_radiation',\n",
+       " 'percipitation', 'pressure', 'minimum_visibility', 'humidity',\n",
+       " 'weekday_sin',\n",
+       " ...\n",
+       " 'O3_last_year_4_days_before', 'NO2_last_year_4_days_before',\n",
+       " 'O3_last_year_5_days_before', 'NO2_last_year_5_days_before',\n",
+       " 'O3_last_year_6_days_before', 'NO2_last_year_6_days_before',\n",
+       " 'O3_last_year_7_days_before', 'NO2_last_year_7_days_before',\n",
+       " 'O3_last_year_3_days_after', 'NO2_last_year_3_days_after'],\n",
+       " dtype='object', length=103)"
      ]
     },
-    "execution_count":
+    "execution_count": 11,
     "metadata": {},
    "output_type": "execute_result"
    }
   ],
   "source": [
-   "test_data"
+   "test_data.columns"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 8,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from src.models_loading import run_model"
   ]
  },
  {
@@ -162,6 +87,51 @@
   "metadata": {},
   "outputs": [],
   "source": []
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 12,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "2024-10-22 21:43:37.935 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
+     "2024-10-22 21:43:37.938 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
+     "2024-10-22 21:43:37.939 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
+     "2024-10-22 21:43:37.980 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
+     "2024-10-22 21:43:37.980 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n"
+    ]
+   },
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Number of rows with missing values dropped: 7\n"
+    ]
+   },
+   {
+    "ename": "FileNotFoundError",
+    "evalue": "[Errno 2] No such file or directory: '../scalers/feature_scaler_O3.joblib'",
+    "output_type": "error",
+    "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m prediction \u001b[38;5;241m=\u001b[39m \u001b[43mrun_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mO3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/streamlit/runtime/caching/cache_utils.py:210\u001b[0m, in \u001b[0;36mCachedFunc.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mshow_spinner \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mshow_spinner, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m spinner(message, _cache\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[0;32m--> 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_or_create_cached_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 211\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 212\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_or_create_cached_value(args, kwargs)\n",
"File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/streamlit/runtime/caching/cache_utils.py:235\u001b[0m, in \u001b[0;36mCachedFunc._get_or_create_cached_value\u001b[0;34m(self, func_args, func_kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m cached_result \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mread_result(value_key)\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handle_cache_hit(cached_result)\n\u001b[0;32m--> 235\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_handle_cache_miss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcache\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/streamlit/runtime/caching/cache_utils.py:292\u001b[0m, in \u001b[0;36mCachedFunc._handle_cache_miss\u001b[0;34m(self, cache, value_key, func_args, func_kwargs)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[38;5;66;03m# We acquired the lock before any other thread. Compute the value!\u001b[39;00m\n\u001b[1;32m 289\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mcached_message_replay_ctx\u001b[38;5;241m.\u001b[39mcalling_cached_function(\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mfunc\n\u001b[1;32m 291\u001b[0m ):\n\u001b[0;32m--> 292\u001b[0m computed_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_info\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfunc_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfunc_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[38;5;66;03m# We've computed our value, and now we need to write it back to the cache\u001b[39;00m\n\u001b[1;32m 295\u001b[0m \u001b[38;5;66;03m# along with any \"replay messages\" that were generated during value computation.\u001b[39;00m\n\u001b[1;32m 296\u001b[0m messages \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mcached_message_replay_ctx\u001b[38;5;241m.\u001b[39m_most_recent_messages\n",
"File \u001b[0;32m~/Desktop/utrecht-pollution-prediction/src/models_loading.py:28\u001b[0m, in \u001b[0;36mrun_model\u001b[0;34m(particle, data)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;129m@st\u001b[39m\u001b[38;5;241m.\u001b[39mcache_resource(ttl\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m6\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m300\u001b[39m) \u001b[38;5;66;03m# Reruns every 6 hours\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun_model\u001b[39m(particle, data):\n\u001b[0;32m---> 28\u001b[0m input_data \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_particle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparticle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m model \u001b[38;5;241m=\u001b[39m load_model(particle)\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# Run the model with static input\u001b[39;00m\n",
"File \u001b[0;32m~/Desktop/utrecht-pollution-prediction/src/data_loading.py:125\u001b[0m, in \u001b[0;36mcreate_features\u001b[0;34m(data, target_particle, lag_days, sma_days)\u001b[0m\n\u001b[1;32m 121\u001b[0m x \u001b[38;5;241m=\u001b[39m data[feature_cols]\n\u001b[1;32m 124\u001b[0m \u001b[38;5;66;03m# Initialize scalers\u001b[39;00m\n\u001b[0;32m--> 125\u001b[0m feature_scaler \u001b[38;5;241m=\u001b[39m \u001b[43mjoblib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m../scalers/feature_scaler_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mtarget_particle\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.joblib\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;66;03m# Fit the scalers on the training data\u001b[39;00m\n\u001b[1;32m 128\u001b[0m X_scaled \u001b[38;5;241m=\u001b[39m feature_scaler\u001b[38;5;241m.\u001b[39mfit_transform(x)\n",
"File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/joblib/numpy_pickle.py:650\u001b[0m, in \u001b[0;36mload\u001b[0;34m(filename, mmap_mode)\u001b[0m\n\u001b[1;32m 648\u001b[0m obj \u001b[38;5;241m=\u001b[39m _unpickle(fobj)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _read_fileobject(f, filename, mmap_mode) \u001b[38;5;28;01mas\u001b[39;00m fobj:\n\u001b[1;32m 652\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(fobj, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 653\u001b[0m \u001b[38;5;66;03m# if the returned file object is a string, this means we\u001b[39;00m\n\u001b[1;32m 654\u001b[0m \u001b[38;5;66;03m# try to load a pickle file generated with an version of\u001b[39;00m\n\u001b[1;32m 655\u001b[0m \u001b[38;5;66;03m# Joblib so we load it with joblib compatibility function.\u001b[39;00m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../scalers/feature_scaler_O3.joblib'"
+    ]
+   }
+  ],
+  "source": [
+   "prediction = run_model(\"O3\", data=dataset)"
+  ]
  }
 ],
 "metadata": {
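
The FileNotFoundError in the last cell comes from the relative scaler path ('../scalers/feature_scaler_O3.joblib' in the traceback, 'scalers/...' in the committed code), which only resolves when the process starts from the matching working directory. One way to make the lookup independent of the working directory, offered purely as a sketch and not part of this commit, is to anchor the path to the source file:

    from pathlib import Path

    import joblib

    # assumes scalers/ sits next to src/ at the repository root, as in the file list above
    SCALER_DIR = Path(__file__).resolve().parent.parent / "scalers"

    def load_feature_scaler(target_particle):
        return joblib.load(SCALER_DIR / f"feature_scaler_{target_particle}.joblib")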
test.py
CHANGED
@@ -1,13 +1,7 @@
-from data_loading import create_features_and_targets
-from data_api_calls import get_data
 import pandas as pd
 
+from src.models_loading import run_model
 dataset = pd.read_csv("dataset.csv")
-
-
-
-    target_particle="NO2",
-    lag_days=7,
-    sma_days=7,
-    days_ahead=3,
-)
+prediction = run_model("O3", data=dataset)
+print(type(prediction))
+print(prediction)