Mihkelmj committed
Commit eeaf86d · 1 Parent(s): ca76ce0

recreating the feature creation function; need to get data from previous years

__pycache__/data_api_calls.cpython-312.pyc ADDED
Binary file (10.7 kB).
 
__pycache__/data_loading.cpython-312.pyc ADDED
Binary file (8.88 kB).
 
app.py CHANGED
@@ -8,6 +8,7 @@ from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from helper_functions import custom_metric_box, pollution_box, run_model
+ from data_api_calls import get_data

st.set_page_config(
    page_title="Utrecht Pollution Dashboard",
@@ -18,6 +19,9 @@ st.set_page_config(
alt.themes.enable("dark")

prediction = run_model() # Assuming you have a function run_model()
+ get_data()
+
+ data = pd.read_csv("dataset.csv")

# App Title
st.title("Utrecht Pollution Dashboard 🌱")
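As committed, get_data() and the CSV read run at module level, so every Streamlit rerun repeats the API pull. A minimal sketch of one way to limit that, assuming the rest of app.py is unchanged (the load_dataset helper and the one-hour TTL are illustrative, not part of this commit):

# Sketch only: cache the refreshed dataset so Streamlit reruns reuse it.
import pandas as pd
import streamlit as st

from data_api_calls import get_data


@st.cache_data(ttl=3600)  # hit the pollution/weather APIs at most once per hour
def load_dataset() -> pd.DataFrame:
    get_data()  # rewrites dataset.csv, as in data_api_calls.py
    return pd.read_csv("dataset.csv")


data = load_dataset()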
daily_api__pollution.py → data_api_calls.py RENAMED
@@ -5,8 +5,11 @@ from io import StringIO
import os
import re
import csv
+ import urllib.request
+ import sys
+ import codecs

- def api_call():
+ def pollution_data():
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    all_dataframes = []
@@ -150,12 +153,39 @@ def insert_pollution(NO2, O3):
    while O3:
        df.loc[start_index, 'O3'] = O3.pop()
        start_index += 1
-     df.to_csv('recorded_data.csv', index=False)
-
- api_call()
- NO2, O3 = clean_values()
- add_columns()
- scale()
- insert_pollution(NO2, O3)
- os.remove('combined_data.csv')
- os.remove('weather_data.csv')
+     df.to_csv('dataset.csv', index=False)
+
+ def weather_data():
+     today = date.today().isoformat()
+     seven_days = (date.today() - timedelta(6)).isoformat()
+     try:
+         ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{seven_days}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")
+
+         # Parse the results as CSV
+         CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
+         # Saving the CSV content to a file
+         current_dir = os.path.dirname(os.path.realpath(__file__))
+         file_path = os.path.join(current_dir, 'weather_data.csv')
+         with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
+             csv_writer = csv.writer(csvfile)
+             csv_writer.writerows(CSVText)
+
+     except urllib.error.HTTPError as e:
+         ErrorInfo = e.read().decode()
+         print('Error code: ', e.code, ErrorInfo)
+         sys.exit()
+     except urllib.error.URLError as e:
+         ErrorInfo = e.read().decode()
+         print('Error code: ', e.code, ErrorInfo)
+         sys.exit()
+
+
+ def get_data():
+     weather_data()
+     pollution_data()
+     NO2, O3 = clean_values()
+     add_columns()
+     scale()
+     insert_pollution(NO2, O3)
+     os.remove('combined_data.csv')
+     os.remove('weather_data.csv')
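The commit message notes the pipeline still needs data from previous years, while weather_data() above only pulls the last seven days. A minimal sketch of a date-ranged pull built on the same Visual Crossing timeline endpoint and element list (fetch_weather_range and its output path are hypothetical helpers, not part of the commit):

# Sketch only: same endpoint as weather_data(), but over an arbitrary date range.
import codecs
import csv
import urllib.request
from datetime import date


def fetch_weather_range(start: date, end: date, out_path: str = "weather_history.csv") -> None:
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"Utrecht/{start.isoformat()}/{end.isoformat()}"
        "?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity"
        "&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
    )
    with urllib.request.urlopen(url) as result:
        rows = csv.reader(codecs.iterdecode(result, "utf-8"))
        with open(out_path, "w", newline="", encoding="utf-8") as csvfile:
            csv.writer(csvfile).writerows(rows)


# e.g. last year's history for the 365-day lag features:
# fetch_weather_range(date(2023, 1, 1), date(2023, 12, 31))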
data_loading.py CHANGED
@@ -110,7 +110,6 @@ def create_features_and_targets(
    """
    import warnings

-     import joblib
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
@@ -175,10 +174,6 @@ def create_features_and_targets(
    data["O3_last_year_3_days_after"] = data["O3"].shift(365 - 3)
    data["NO2_last_year_3_days_after"] = data["NO2"].shift(365 - 3)

-     # Create targets only for the specified particle for the next 'days_ahead' days
-     for day in range(1, days_ahead + 1):
-         data[f"{target_particle}_plus_{day}_day"] = data[target_particle].shift(-day)
-
    # Calculate the number of rows before dropping missing values
    rows_before = data.shape[0]

@@ -192,85 +187,26 @@ def create_features_and_targets(
    rows_dropped = rows_before - rows_after
    print(f"Number of rows with missing values dropped: {rows_dropped}")

-     # Now, split data into train, validation, and test sets using the most recent dates
-     total_days = data.shape[0]
-     test_size = 365
-     val_size = 365
-
-     if total_days < test_size + val_size:
-         raise ValueError(
-             "Not enough data to create validation and test sets of 365 days each."
-         )
-
    # Ensure the data is sorted by date in ascending order
    data = data.sort_values("date").reset_index(drop=True)

-     # Split data
-     train_data = data.iloc[: -(val_size + test_size)]
-     val_data = data.iloc[-(val_size + test_size) : -test_size]
-     test_data = data.iloc[-test_size:]
-
-     # Define target columns for the specified particle
-     target_cols = [
-         f"{target_particle}_plus_{day}_day" for day in range(1, days_ahead + 1)
-     ]
-
    # Define feature columns
-     exclude_cols = ["date", "weekday", "month"] + target_cols
+     exclude_cols = ["date", "weekday", "month"]
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    # Split features and targets
-     X_train = train_data[feature_cols]
-     y_train = train_data[target_cols]
+     x = data[feature_cols]

-     X_val = val_data[feature_cols]
-     y_val = val_data[target_cols]
-
-     X_test = test_data[feature_cols]
-     y_test = test_data[target_cols]

    # Initialize scalers
    feature_scaler = StandardScaler()
-     target_scaler = StandardScaler()

    # Fit the scalers on the training data
-     X_train_scaled = feature_scaler.fit_transform(X_train)
-     y_train_scaled = target_scaler.fit_transform(y_train)
-
-     # Apply the scalers to validation and test data
-     X_val_scaled = feature_scaler.transform(X_val)
-     y_val_scaled = target_scaler.transform(y_val)
-
-     X_test_scaled = feature_scaler.transform(X_test)
-     y_test_scaled = target_scaler.transform(y_test)
+     X_scaled = feature_scaler.fit_transform(x)

    # Convert scaled data back to DataFrame for consistency
-     X_train_scaled = pd.DataFrame(
-         X_train_scaled, columns=feature_cols, index=X_train.index
-     )
-     y_train_scaled = pd.DataFrame(
-         y_train_scaled, columns=target_cols, index=y_train.index
+     X_scaled = pd.DataFrame(
+         X_scaled, columns=feature_cols, index=x.index
    )

-     X_val_scaled = pd.DataFrame(X_val_scaled, columns=feature_cols, index=X_val.index)
-     y_val_scaled = pd.DataFrame(y_val_scaled, columns=target_cols, index=y_val.index)
-
-     X_test_scaled = pd.DataFrame(
-         X_test_scaled, columns=feature_cols, index=X_test.index
-     )
-     y_test_scaled = pd.DataFrame(y_test_scaled, columns=target_cols, index=y_test.index)
-
-     # Save the scalers to files
-     joblib.dump(feature_scaler, "feature_scaler.joblib")
-     # Save the target scaler with the particle name to distinguish
-     target_scaler_filename = f"target_scaler_{target_particle}.joblib"
-     joblib.dump(target_scaler, target_scaler_filename)
-
-     return (
-         X_train_scaled,
-         y_train_scaled,
-         X_val_scaled,
-         y_val_scaled,
-         X_test_scaled,
-         y_test_scaled,
-     )
+     return X_scaled
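Note that the refactored function now fits a fresh StandardScaler on whatever frame it receives, while the removed training-time code persisted its scalers with joblib. A minimal sketch of reusing the training scaler at inference time instead of refitting (assumes a feature_scaler.joblib produced by the old code path is available):

# Sketch only: scale inference features with the statistics learned at training time.
import joblib
import pandas as pd

feature_scaler = joblib.load("feature_scaler.joblib")


def scale_features(x: pd.DataFrame) -> pd.DataFrame:
    # transform() only; no refit on the short inference window
    return pd.DataFrame(feature_scaler.transform(x), columns=x.columns, index=x.index)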
dataset.csv ADDED
@@ -0,0 +1,8 @@
+ date,NO2,O3,wind_speed,mean_temp,global_radiation,percipitation,pressure,minimum_visibility,humidity,weekday
+ 2024-10-16,22.4144459833795,22.78109803921569,61,151,40,0,10103,358,82,Wednesday
+ 2024-10-17,22.990465489566613,22.928154311649017,51,169,43,6,10100,371,86,Thursday
+ 2024-10-18,23.659013539651834,23.700536672629696,21,156,42,39,10140,64,97,Friday
+ 2024-10-19,24.727853658536585,23.52574561403509,43,147,43,28,10140,236,92,Saturday
+ 2024-10-20,22.700366666666664,24.317572254335257,68,145,0,0,10160,241,82,Sunday
+ 2024-10-21,19.763439153439155,25.661659574468086,66,142,27,39,10201,110,90,Monday
+ 2024-10-22,20.281666666666666,25.787520661157025,76,120,54,97,10266,128,87,Tuesday
python.py ADDED
@@ -0,0 +1,3 @@
+ from data_api_calls import get_data
+
+ get_data()
requirements.txt CHANGED
@@ -5,4 +5,6 @@ joblib # or pickle if you're using that to load the model
scikit-learn # for mock model
altair
matplotlib
- plotly
+ plotly
+ http.client
+ datetime
test.ipynb ADDED
@@ -0,0 +1,87 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from data_loading import create_features_and_targets\n",
+ "from data_api_calls import get_data\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = pd.read_csv(\"dataset.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of rows with missing values dropped: 7\n"
+ ]
+ },
+ {
+ "ename": "ValueError",
+ "evalue": "Found array with 0 sample(s) (shape=(0, 92)) while a minimum of 1 is required by StandardScaler.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_features_and_targets\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_particle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mNO2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mlag_days\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m6\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43msma_days\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m6\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mdays_ahead\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/Desktop/utrecht-pollution-prediction/data_loading.py:214\u001b[0m, in \u001b[0;36mcreate_features_and_targets\u001b[0;34m(data, target_particle, lag_days, sma_days, days_ahead)\u001b[0m\n\u001b[1;32m 211\u001b[0m target_scaler \u001b[38;5;241m=\u001b[39m StandardScaler()\n\u001b[1;32m 213\u001b[0m \u001b[38;5;66;03m# Fit the scalers on the training data\u001b[39;00m\n\u001b[0;32m--> 214\u001b[0m X_scaled \u001b[38;5;241m=\u001b[39m \u001b[43mfeature_scaler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 215\u001b[0m y_scaled \u001b[38;5;241m=\u001b[39m target_scaler\u001b[38;5;241m.\u001b[39mfit_transform(y)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;66;03m# Convert scaled data back to DataFrame for consistency\u001b[39;00m\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/utils/_set_output.py:313\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[0;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 313\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m 315\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[1;32m 316\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 317\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[1;32m 318\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[1;32m 319\u001b[0m )\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/base.py:1098\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 1083\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 1084\u001b[0m (\n\u001b[1;32m 1085\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis object (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) has a `transform`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1093\u001b[0m \u001b[38;5;167;01mUserWarning\u001b[39;00m,\n\u001b[1;32m 1094\u001b[0m )\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;66;03m# fit method of arity 1 (unsupervised transformation)\u001b[39;00m\n\u001b[0;32m-> 1098\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtransform(X)\n\u001b[1;32m 1099\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1100\u001b[0m \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/preprocessing/_data.py:878\u001b[0m, in \u001b[0;36mStandardScaler.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[38;5;66;03m# Reset internal state before fitting\u001b[39;00m\n\u001b[1;32m 877\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset()\n\u001b[0;32m--> 878\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpartial_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1471\u001b[0m )\n\u001b[1;32m 1472\u001b[0m ):\n\u001b[0;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/preprocessing/_data.py:914\u001b[0m, in \u001b[0;36mStandardScaler.partial_fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 882\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Online computation of mean and std on X for later scaling.\u001b[39;00m\n\u001b[1;32m 883\u001b[0m \n\u001b[1;32m 884\u001b[0m \u001b[38;5;124;03mAll of X is processed as a single batch. This is intended for cases\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[38;5;124;03m Fitted scaler.\u001b[39;00m\n\u001b[1;32m 912\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 913\u001b[0m first_call \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples_seen_\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 914\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 915\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 916\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 917\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mFLOAT_DTYPES\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 918\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mallow-nan\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 919\u001b[0m \u001b[43m \u001b[49m\u001b[43mreset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfirst_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 920\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 921\u001b[0m n_features \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 923\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/base.py:633\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 631\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m no_val_y:\n\u001b[0;32m--> 633\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mX\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 634\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_y:\n\u001b[1;32m 635\u001b[0m out \u001b[38;5;241m=\u001b[39m _check_y(y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_params)\n",
+ "File \u001b[0;32m~/anaconda3/envs/ml-industry/lib/python3.12/site-packages/sklearn/utils/validation.py:1087\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 1085\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(array)\n\u001b[1;32m 1086\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_samples \u001b[38;5;241m<\u001b[39m ensure_min_samples:\n\u001b[0;32m-> 1087\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1088\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound array with \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m sample(s) (shape=\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m) while a\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1089\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m minimum of \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m is required\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1090\u001b[0m \u001b[38;5;241m%\u001b[39m (n_samples, array\u001b[38;5;241m.\u001b[39mshape, ensure_min_samples, context)\n\u001b[1;32m 1091\u001b[0m )\n\u001b[1;32m 1093\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ensure_min_features \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m array\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[1;32m 1094\u001b[0m n_features \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n",
+ "\u001b[0;31mValueError\u001b[0m: Found array with 0 sample(s) (shape=(0, 92)) while a minimum of 1 is required by StandardScaler."
+ ]
+ }
+ ],
+ "source": [
+ "test_data = create_features_and_targets(\n",
+ "    data=dataset,\n",
+ "    target_particle=\"NO2\",\n",
+ "    lag_days=7,\n",
+ "    sma_days=7,\n",
+ "    days_ahead=3,\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ml-industry",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
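The traceback above is consistent with the commit message: with only the seven rows currently in dataset.csv, the 365-day lag features built in data_loading.py are all NaN, so dropna() leaves zero rows for the scaler. A minimal sketch of the effect (illustrative, not part of the commit):

# Sketch only: a ~year-long shift on a 7-row frame leaves no complete rows.
import pandas as pd

df = pd.read_csv("dataset.csv")                  # 7 daily rows
df["NO2_last_year"] = df["NO2"].shift(365 - 3)   # same style of lag as data_loading.py
print(len(df.dropna()))                          # 0 -> StandardScaler sees shape (0, n_features)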
test.py ADDED
@@ -0,0 +1,13 @@
+ from data_loading import create_features_and_targets
+ from data_api_calls import get_data
+ import pandas as pd
+
+ dataset = pd.read_csv("dataset.csv")
+
+ X, y = create_features_and_targets(
+     data=dataset,
+     target_particle="NO2",
+     lag_days=7,
+     sma_days=7,
+     days_ahead=3,
+ )
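As committed, create_features_and_targets now returns a single scaled feature frame, so the two-name unpacking above no longer matches its return value. A minimal sketch of a call that fits the refactored signature (assuming no further changes to data_loading.py):

# Sketch only: the refactored function returns just the scaled features.
import pandas as pd

from data_loading import create_features_and_targets

dataset = pd.read_csv("dataset.csv")

X = create_features_and_targets(
    data=dataset,
    target_particle="NO2",
    lag_days=7,
    sma_days=7,
    days_ahead=3,
)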