created last-year features; added past_data_api_calls.py; modified data_loading.py
- __pycache__/data_api_calls.cpython-312.pyc +0 -0
- dataset.csv +2 -2
- past_data_api_calls.py +209 -0
- src/data_loading.py +13 -8
__pycache__/data_api_calls.cpython-312.pyc
CHANGED
Binary files a/__pycache__/data_api_calls.cpython-312.pyc and b/__pycache__/data_api_calls.cpython-312.pyc differ
dataset.csv
CHANGED
@@ -5,5 +5,5 @@ date,NO2,O3,wind_speed,mean_temp,global_radiation,percipitation,pressure,minimum
 2024-10-19,24.532038834951457,23.604722719141325,43,147,43,28,10140,236,92,Saturday
 2024-10-20,23.019101941747575,24.173377192982453,68,145,0,0,10160,241,82,Sunday
 2024-10-21,21.275629139072848,25.05873563218391,58,144,27,43,10206,220,92,Monday
-2024-10-22,22.334374999999998,24.5942194092827,76,123,
-2024-10-23,24.261733333333336,23.56,31,115,7,0,10328,
+2024-10-22,22.334374999999998,24.5942194092827,76,123,57,12,10265,100,87,Tuesday
+2024-10-23,24.261733333333336,23.56,31,115,7,0,10328,105,95,Wednesday
past_data_api_calls.py
ADDED
@@ -0,0 +1,209 @@
import codecs
import csv
import http.client
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import date, timedelta
from io import StringIO

import pandas as pd


def pollution_data():
    # Fetch NO2 and O3 measurements from the Luchtmeetnet open API for an
    # 11-day window around this day last year, one CSV per particle per day.
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    last_year_date = date.today() - timedelta(days=365)
    start_date = last_year_date - timedelta(days=7)
    end_date = last_year_date + timedelta(days=3)
    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
    for current_date in date_list:
        today = current_date.isoformat() + "T09:00:00Z"
        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
        for particle in particles:
            all_dataframes = []  # Reset for each particle
            for station in stations:
                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
                conn.request("GET", f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}")
                res = conn.getresponse()
                decoded_data = res.read().decode("utf-8")
                conn.close()
                # Parse the response body and keep only the measurement value columns
                df = pd.read_csv(StringIO(decoded_data))
                df = df.filter(like='value')
                all_dataframes.append(df)
            if all_dataframes:
                combined_data = pd.concat(all_dataframes, ignore_index=True)
                combined_data.to_csv(f'{particle}_{today}.csv', index=False)


def delete_csv(csvs):
    # Remove the intermediate per-day CSV files
    for csv_file in csvs:
        if os.path.exists(csv_file) and os.path.isfile(csv_file):
            os.remove(csv_file)


def clean_values():
    # Read each per-day CSV written by pollution_data(), extract the numeric
    # parts of every field, and average them into one value per day.
    particles = ["NO2", "O3"]
    csvs = []
    NO2 = []
    O3 = []
    last_year_date = date.today() - timedelta(days=365)
    start_date = last_year_date - timedelta(days=7)
    end_date = last_year_date + timedelta(days=3)
    date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
    for current_date in date_list:
        today = current_date.isoformat() + "T09:00:00Z"
        for particle in particles:
            name = f'{particle}_{today}.csv'
            csvs.append(name)
    for csv_file in csvs:
        if not os.path.exists(csv_file):
            continue  # Skip if the file doesn't exist
        values = []  # Reset values for each CSV file
        # Open the CSV file and read the values
        with open(csv_file, 'r') as file:
            reader = csv.reader(file)
            for row in reader:
                for value in row:
                    # Use a regular expression to extract the numeric part
                    cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
                    if cleaned_value:  # If we successfully extracted a number
                        values.append(float(cleaned_value[0]))  # Convert the first match to float

        # Compute the average if the values list is not empty
        if values:
            avg = sum(values) / len(values)
            if "NO2" in csv_file:
                NO2.append(avg)
            else:
                O3.append(avg)
    delete_csv(csvs)
    return NO2, O3


def add_columns():
    # Add empty NO2, O3, and weekday columns to the downloaded weather data
    file_path = 'weather_data.csv'
    df = pd.read_csv(file_path)

    df.insert(1, 'NO2', None)
    df.insert(2, 'O3', None)
    df.insert(10, 'weekday', None)

    return df


def scale(data):
    # Reorder, rename, and rescale the weather columns to match dataset.csv
    df = data
    columns = list(df.columns)

    columns.insert(3, columns.pop(6))
    df = df[columns]

    columns.insert(5, columns.pop(9))
    df = df[columns]

    columns.insert(9, columns.pop(6))
    df = df[columns]

    df = df.rename(columns={
        'datetime': 'date',
        'windspeed': 'wind_speed',
        'temp': 'mean_temp',
        'solarradiation': 'global_radiation',
        'precip': 'percipitation',
        'sealevelpressure': 'pressure',
        'visibility': 'minimum_visibility'
    })

    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df['date'].dt.day_name()

    df = df.sort_values(by='date').reset_index(drop=True)

    # km/h -> m/s, then store in tenths of a unit like the rest of the dataset
    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
    df['mean_temp'] = df['mean_temp'] * 10
    df['minimum_visibility'] = df['minimum_visibility'] * 10
    df['percipitation'] = df['percipitation'] * 10
    df['pressure'] = df['pressure']

    df['wind_speed'] = df['wind_speed'].astype(int)
    df['mean_temp'] = df['mean_temp'].astype(int)
    df['minimum_visibility'] = df['minimum_visibility'].astype(int)
    df['percipitation'] = df['percipitation'].astype(int)
    df['pressure'] = df['pressure'].astype(int)
    df['humidity'] = df['humidity'].astype(int)
    df['global_radiation'] = df['global_radiation'].astype(int)

    return df


def insert_pollution(NO2, O3, data):
    # Fill the placeholder columns with the averaged pollution values
    df = data
    df['NO2'] = NO2
    df['O3'] = O3
    return df


def weather_data():
    # Get last year's same day
    last_year_date = date.today() - timedelta(days=365)
    # Start date is 7 days prior
    start_date = (last_year_date - timedelta(days=7)).isoformat()
    # End date is 3 days ahead
    end_date = (last_year_date + timedelta(days=3)).isoformat()
    try:
        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")

        # Parse the results as CSV
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
        # Save the CSV content to a file
        current_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(current_dir, 'weather_data.csv')
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows(CSVText)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print('Error code: ', e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # URLError carries no response body or status code; report the reason
        print('Error: ', e.reason)
        sys.exit()


# NOTE: this second definition overrides the weather_data() above; it shifts
# the window one day earlier.
def weather_data():
    # Set up dates for last year: 8 days before this day last year through 2 days after
    today_last_year = date.today() - timedelta(365)
    start_last_year = today_last_year - timedelta(8)
    end_last_year = today_last_year + timedelta(2)

    try:
        # API call with the date range for last year
        ResultBytes = urllib.request.urlopen(f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_last_year}/{end_last_year}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv")

        # Parse the results as CSV
        CSVText = csv.reader(codecs.iterdecode(ResultBytes, 'utf-8'))
        # Save the CSV content to a file
        current_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(current_dir, 'weather_data.csv')
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerows(CSVText)

    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print('Error code: ', e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # URLError carries no response body or status code; report the reason
        print('Error: ', e.reason)
        sys.exit()


def get_past_data():
    # Build the full past-data frame: weather columns plus averaged pollution values
    weather_data()
    pollution_data()
    NO2, O3 = clean_values()
    df = add_columns()
    scaled_df = scale(df)
    output_df = insert_pollution(NO2, O3, scaled_df)
    os.remove('weather_data.csv')
    return output_df
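For reference, a minimal sketch of how the new module is meant to be consumed (this assumes the process runs from the directory holding the script, since the intermediate weather_data.csv and per-particle CSVs are resolved relative to it; column names follow the rename in scale() above):

    from past_data_api_calls import get_past_data

    past_data = get_past_data()
    # One row per day across an 11-day window around this day last year
    print(past_data[["date", "NO2", "O3"]])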
src/data_loading.py
CHANGED
@@ -1,6 +1,9 @@
+import joblib
 import numpy as np
 import pandas as pd
-
+
+from past_data_api_calls import get_past_data
+
 
 def create_features(
     data,
@@ -83,19 +86,21 @@
         data[feature].rolling(window=sma_days).mean()
     )
 
+    past_data = get_past_data()
     # Create particle data (NO2 and O3) from the same time last year
    # Today last year
-    data["O3_last_year"] = data["O3"].shift(365)
-    data["NO2_last_year"] = data["NO2"].shift(365)
+
+    data["O3_last_year"] = past_data["O3"].iloc[-4]  # data["O3_last_year"] = data["O3"].shift(365)
+    data["NO2_last_year"] = past_data["NO2"].iloc[-4]  # data["NO2_last_year"] = data["NO2"].shift(365)
 
     # 7 days before today last year
-    for i in range(1, lag_days + 1):
-        data[f"O3_last_year_{i}_days_before"] = data["O3"].shift(365 + i)
-        data[f"NO2_last_year_{i}_days_before"] = data["NO2"].shift(365 + i)
+    for i in range(1, lag_days+1):
+        data[f"O3_last_year_{i}_days_before"] = past_data["O3"].iloc[i-1]  # data["O3"].shift(365 + i)
+        data[f"NO2_last_year_{i}_days_before"] = past_data["NO2"].iloc[i-1]  # data["NO2"].shift(365 + i)
 
     # 3 days after today last year
-    data["O3_last_year_3_days_after"] = data["O3"].shift(365 - 3)
-    data["NO2_last_year_3_days_after"] = data["NO2"].shift(365 - 3)
+    data["O3_last_year_3_days_after"] = past_data["O3"].iloc[-1]  # data["O3"].shift(365 - 3)
+    data["NO2_last_year_3_days_after"] = past_data["NO2"].iloc[-1]  # data["NO2"].shift(365 - 3)
 
     # Calculate the number of rows before dropping missing values
     rows_before = data.shape[0]
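A quick, hypothetical sanity check of the .iloc offsets used above, assuming get_past_data() returns one row per day for the 11-day window, ordered oldest-first as clean_values() builds it:

    import pandas as pd

    # Stand-in for get_past_data(): 11 daily rows, oldest first
    # (offsets -7 .. +3 relative to this day last year).
    past_data = pd.DataFrame({"O3": range(11), "NO2": range(11)})

    assert past_data["O3"].iloc[-4] == 7   # row 7  -> this day last year
    assert past_data["O3"].iloc[0] == 0    # row 0  -> 7 days before
    assert past_data["O3"].iloc[-1] == 10  # row 10 -> 3 days after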