Mihkelmj commited on
Commit
386e426
·
1 Parent(s): 2f8050c

created last-year features; added past_data_api_calls.py; modified data_loading.py

Browse files
__pycache__/data_api_calls.cpython-312.pyc CHANGED
Binary files a/__pycache__/data_api_calls.cpython-312.pyc and b/__pycache__/data_api_calls.cpython-312.pyc differ
 
dataset.csv CHANGED
@@ -5,5 +5,5 @@ date,NO2,O3,wind_speed,mean_temp,global_radiation,percipitation,pressure,minimum
5
  2024-10-19,24.532038834951457,23.604722719141325,43,147,43,28,10140,236,92,Saturday
6
  2024-10-20,23.019101941747575,24.173377192982453,68,145,0,0,10160,241,82,Sunday
7
  2024-10-21,21.275629139072848,25.05873563218391,58,144,27,43,10206,220,92,Monday
8
- 2024-10-22,22.334374999999998,24.5942194092827,76,123,60,12,10265,100,87,Tuesday
9
- 2024-10-23,24.261733333333336,23.56,31,115,7,0,10328,112,95,Wednesday
 
5
  2024-10-19,24.532038834951457,23.604722719141325,43,147,43,28,10140,236,92,Saturday
6
  2024-10-20,23.019101941747575,24.173377192982453,68,145,0,0,10160,241,82,Sunday
7
  2024-10-21,21.275629139072848,25.05873563218391,58,144,27,43,10206,220,92,Monday
8
+ 2024-10-22,22.334374999999998,24.5942194092827,76,123,57,12,10265,100,87,Tuesday
9
+ 2024-10-23,24.261733333333336,23.56,31,115,7,0,10328,105,95,Wednesday
past_data_api_calls.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import csv
3
+ import http.client
4
+ import os
5
+ import re
6
+ import sys
7
+ import urllib.request
8
+ from datetime import date, timedelta
9
+ from io import StringIO
10
+
11
+ import pandas as pd
12
+
13
+
14
def pollution_data():
    """Download NO2 and O3 measurements for a 10-day window centred on this
    date one year ago and write one CSV per particle per day.

    For every day in [today - 365d - 7d, today - 365d + 3d] the Luchtmeetnet
    open API is queried once per station; the per-station frames are
    concatenated and saved as ``{particle}_{timestamp}.csv`` for
    ``clean_values()`` to pick up.
    """
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    last_year_date = date.today() - timedelta(days=365)
    start_date = last_year_date - timedelta(days=7)
    end_date = last_year_date + timedelta(days=3)
    date_list = [start_date + timedelta(days=x)
                 for x in range((end_date - start_date).days + 1)]
    for current_date in date_list:
        today = current_date.isoformat() + "T09:00:00Z"
        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
        for particle in particles:
            all_dataframes = []  # reset for each particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                try:
                    conn.request(
                        "GET",
                        f"/open_api/measurements?station_number={station}"
                        f"&formula={particle}&page=1&order_by=timestamp_measured"
                        f"&order_direction=desc&end={today}&start={yesterday}",
                        "",
                        {},
                    )
                    res = conn.getresponse()
                    decoded_data = res.read().decode("utf-8")
                finally:
                    # Fix: connections were previously never closed, leaking a
                    # socket per request (dozens per run).
                    conn.close()
                # NOTE(review): the body is parsed as CSV; confirm this
                # endpoint really returns CSV rather than JSON.
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like='value')
                all_dataframes.append(df)
            if all_dataframes:
                combined_data = pd.concat(all_dataframes, ignore_index=True)
                # NOTE(review): 'today' contains ':' characters, which are not
                # valid in filenames on Windows — confirm target platform.
                combined_data.to_csv(f'{particle}_{today}.csv', index=False)
40
+
41
def delete_csv(csvs):
    """Remove every regular file named in *csvs*, silently skipping paths
    that do not exist (or are not plain files)."""
    for path in csvs:
        if os.path.isfile(path):
            os.remove(path)
45
+
46
def clean_values():
    """Average the per-day pollution CSVs produced by ``pollution_data()``.

    Scans the same 10-day window (7 days before to 3 days after this date
    last year), averages every numeric value found in each existing
    ``{particle}_{timestamp}.csv``, deletes the files, and returns the
    averages.

    Returns:
        tuple[list, list]: (NO2, O3) daily averages, one entry per day-file
        that existed and contained at least one numeric value.
    """
    particles = ["NO2", "O3"]
    NO2 = []
    O3 = []
    # Same window construction as pollution_data(), so the expected
    # filenames line up with what that function wrote.
    last_year_date = date.today() - timedelta(days=365)
    start_date = last_year_date - timedelta(days=7)
    end_date = last_year_date + timedelta(days=3)
    date_list = [start_date + timedelta(days=x)
                 for x in range((end_date - start_date).days + 1)]
    csvs = [
        f'{particle}_{current_date.isoformat()}T09:00:00Z.csv'
        for current_date in date_list
        for particle in particles
    ]
    # Fix: the original pattern r"[-+]?\d*\.\d+|\d+" dropped the sign on
    # negative integers ("-5" matched the bare \d+ alternative as "5");
    # grouping the alternatives keeps the optional sign for both forms.
    # Compiled once instead of per cell.
    number_re = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")
    for csv_file in csvs:
        if not os.path.exists(csv_file):
            continue  # skip files a failed API call never produced
        values = []  # reset per CSV file
        with open(csv_file, 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                for value in row:
                    # Extract the first numeric token from the cell; header
                    # cells like "value" yield no match and are skipped.
                    matches = number_re.findall(value)
                    if matches:
                        values.append(float(matches[0]))
        # Only record an average when the file held at least one number.
        if values:
            avg = sum(values) / len(values)
            if "NO2" in csv_file:
                NO2.append(avg)
            else:
                O3.append(avg)
    delete_csv(csvs)
    return NO2, O3
83
+
84
def add_columns():
    """Load weather_data.csv and add empty NO2, O3 and weekday columns.

    Returns:
        pandas.DataFrame: the weather frame with 'NO2' inserted at position
        1, 'O3' at position 2 and 'weekday' at position 10, all None.
    """
    df = pd.read_csv('weather_data.csv')
    for position, column in ((1, 'NO2'), (2, 'O3'), (10, 'weekday')):
        df.insert(position, column, None)
    return df
93
+
94
def scale(data):
    """Reorder, rename and unit-scale the weather frame from add_columns().

    Expects the column layout produced by ``add_columns()`` (weather columns
    with NO2/O3/weekday already inserted). Returns a new DataFrame whose
    columns match the project's dataset schema and whose numeric columns are
    stored as scaled integers.

    NOTE(review): the positional insert/pop shuffle below is entirely
    order-dependent — it only yields the intended layout for the exact
    column order coming out of add_columns(); confirm before reuse.
    """
    df = data
    columns = list(df.columns)

    # Move the column at index 6 to index 3 (for the add_columns() layout,
    # this pulls 'precip' forward).
    columns.insert(3, columns.pop(6))
    df = df[columns]

    # Move the column at index 9 to index 5 ('humidity' for that layout).
    columns.insert(5, columns.pop(9))
    df = df[columns]

    # Move the column at index 6 to index 9 ('temp' for that layout).
    columns.insert(9, columns.pop(6))
    df = df[columns]

    # Rename API column names to the dataset's schema. 'percipitation' is
    # intentionally kept misspelled to match the existing dataset.csv header.
    df = df.rename(columns={
        'datetime':'date',
        'windspeed': 'wind_speed',
        'temp': 'mean_temp',
        'solarradiation':'global_radiation',
        'precip':'percipitation',
        'sealevelpressure':'pressure',
        'visibility':'minimum_visibility'
    })

    # Parse dates and derive the weekday name ('Monday', ..., 'Sunday').
    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df['date'].dt.day_name()

    # Chronological order with a clean 0..n-1 index.
    df = df.sort_values(by='date').reset_index(drop=True)

    # Scale to the dataset's integer units: x10 fixed-point throughout.
    # Wind speed presumably arrives in km/h (/3.6 -> m/s) — TODO confirm.
    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
    df['mean_temp'] = df['mean_temp'] * 10
    df['minimum_visibility'] = df['minimum_visibility'] * 10
    df['percipitation'] = df['percipitation'] * 10
    df['pressure'] = df['pressure']  # no-op; kept for symmetry with the others

    # Truncate everything to int to match the dataset's storage format.
    df['wind_speed'] = df['wind_speed'].astype(int)
    df['mean_temp'] = df['mean_temp'].astype(int)
    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
    df['percipitation'] = df['percipitation'].astype(int)
    df['pressure'] = df['pressure'].astype(int)
    df['humidity'] = df['humidity'].astype(int)
    df['global_radiation'] = df['global_radiation'].astype(int)

    return df
137
+
138
def insert_pollution(NO2, O3, data):
    """Fill the NO2 and O3 columns of *data* from the given value lists.

    Mutates *data* in place and returns it; each list must have one entry
    per row of the frame.
    """
    data['NO2'] = NO2
    data['O3'] = O3
    return data
143
+
144
def weather_data():
    """Fetch daily Utrecht weather for the 10-day window around this date
    last year and save it as weather_data.csv next to this script.

    Exits the process on any HTTP or network error.

    NOTE(review): this definition is shadowed by a second ``weather_data``
    defined later in this module — only the later one takes effect; one of
    the two should be deleted.
    """
    # Get last year's same day
    last_year_date = date.today() - timedelta(days=365)
    # Start date is 7 days prior
    start_date = (last_year_date - timedelta(days=7)).isoformat()
    # End date is 3 days ahead
    end_date = (last_year_date + timedelta(days=3)).isoformat()
    try:
        # SECURITY(review): API key is hard-coded in the URL; move it to an
        # environment variable or secrets store.
        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")

        # Parse the results as CSV
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
        # Saving the CSV content to a file next to this script
        current_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(current_dir, 'weather_data.csv')
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows(CSVText)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print('Error code: ', e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # Fix: URLError has neither .read() nor .code, so the original
        # handler raised AttributeError and hid the real failure; report
        # the documented .reason attribute instead.
        print('Network error: ', e.reason)
        sys.exit()
171
+
172
def weather_data():
    """Fetch daily Utrecht weather around this date last year and save it as
    weather_data.csv next to this script.

    Exits the process on any HTTP or network error.

    NOTE(review): duplicate definition — this shadows the earlier
    ``weather_data`` in this module. It also uses an 8-days-before /
    2-days-after window while its own comment (and the earlier version)
    says 7/3; confirm which window is intended.
    """
    # Window: 8 days before to 2 days after this date last year.
    today_last_year = date.today() - timedelta(365)
    start_last_year = today_last_year - timedelta(8)
    end_last_year = today_last_year + timedelta(2)

    try:
        # SECURITY(review): API key is hard-coded in the URL; move it to an
        # environment variable or secrets store.
        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_last_year}/{end_last_year}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")

        # Parse the results as CSV
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
        # Saving the CSV content to a file next to this script
        current_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(current_dir, 'weather_data.csv')
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows(CSVText)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print('Error code: ', e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # Fix: URLError has neither .read() nor .code, so the original
        # handler raised AttributeError and hid the real failure; report
        # the documented .reason attribute instead.
        print('Network error: ', e.reason)
        sys.exit()
199
+
200
+
201
def get_past_data():
    """Build last year's feature frame: weather columns plus NO2/O3 averages.

    Downloads weather and pollution data, averages the pollution files,
    reshapes/scales the weather frame, and fills in the particle columns.
    The temporary weather_data.csv is deleted before returning.
    """
    weather_data()
    pollution_data()
    NO2, O3 = clean_values()
    weather_df = scale(add_columns())
    result = insert_pollution(NO2, O3, weather_df)
    os.remove('weather_data.csv')
    return result
src/data_loading.py CHANGED
@@ -1,6 +1,9 @@
 
1
  import numpy as np
2
  import pandas as pd
3
- import joblib
 
 
4
 
5
  def create_features(
6
  data,
@@ -83,19 +86,21 @@ def create_features(
83
  data[feature].rolling(window=sma_days).mean()
84
  )
85
 
 
86
  # Create particle data (NO2 and O3) from the same time last year
87
  # Today last year
88
- data["O3_last_year"] = 0 # data["O3_last_year"] = data["O3"].shift(365)
89
- data["NO2_last_year"] = 0 # data["NO2_last_year"] = data["NO2"].shift(365)
 
90
 
91
  # 7 days before today last year
92
- for i in range(1, lag_days + 1):
93
- data[f"O3_last_year_{i}_days_before"] = 0 # data["O3"].shift(365 + i)
94
- data[f"NO2_last_year_{i}_days_before"] = 0 # data["NO2"].shift(365 + i)
95
 
96
  # 3 days after today last year
97
- data["O3_last_year_3_days_after"] = 0 # data["O3"].shift(365 - 3)
98
- data["NO2_last_year_3_days_after"] = 0 # data["NO2"].shift(365 - 3)
99
 
100
  # Calculate the number of rows before dropping missing values
101
  rows_before = data.shape[0]
 
1
+ import joblib
2
  import numpy as np
3
  import pandas as pd
4
+
5
+ from past_data_api_calls import get_past_data
6
+
7
 
8
  def create_features(
9
  data,
 
86
  data[feature].rolling(window=sma_days).mean()
87
  )
88
 
89
+ past_data = get_past_data()
90
  # Create particle data (NO2 and O3) from the same time last year
91
  # Today last year
92
+
93
+ data["O3_last_year"] = past_data["O3"].iloc[-4] # data["O3_last_year"] = data["O3"].shift(365)
94
+ data["NO2_last_year"] = past_data["NO2"].iloc[-4] # data["NO2_last_year"] = data["NO2"].shift(365)
95
 
96
  # 7 days before today last year
97
+ for i in range(1, lag_days+1):
98
+ data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i-1] # data["O3"].shift(365 + i)
99
+ data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i-1] # data["NO2"].shift(365 + i)
100
 
101
  # 3 days after today last year
102
+ data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1] # data["O3"].shift(365 - 3)
103
+ data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1] # data["NO2"].shift(365 - 3)
104
 
105
  # Calculate the number of rows before dropping missing values
106
  rows_before = data.shape[0]