Annikaijak commited on
Commit
57dbe6b
1 Parent(s): 24196a1

Upload functions.py

Browse files
Files changed (1) hide show
  1. functions.py +385 -0
functions.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datetime
3
+ import time
4
+ import requests
5
+ import pandas as pd
6
+ import json
7
+
8
+ from geopy.geocoders import Nominatim
9
+
10
+
11
+
12
+
13
def convert_date_to_unix(x):
    """
    Convert a datetime-like value to Unix time in milliseconds.

    The argument is rendered with ``str()`` and parsed with the
    '%Y-%m-%d %H:%M:%S' format, so strings, ``datetime`` objects and
    pandas ``Timestamp``s (whose str() matches that layout) all work.
    The resulting naive datetime is interpreted in the local timezone.
    """
    parsed = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    millis = parsed.timestamp() * 1000
    return int(millis)
20
+
21
+
22
def get_city_coordinates(city_name: str):
    """
    Resolve a city name to (latitude, longitude), rounded to 2 decimals.

    Performs a network call to the Nominatim geocoding service.
    Returns a (lat, lon) tuple of floats.
    """
    # Nominatim requires a user agent string for API access.
    geocoder = Nominatim(user_agent="MyApp")
    location = geocoder.geocode(city_name)

    return round(location.latitude, 2), round(location.longitude, 2)
34
+
35
+
36
+ ##################################### EEA
37
def convert_to_daily(df, pollutant: str):
    """
    Resample the pollutant column of *df* to one row per day.

    Takes a DataFrame with a 'date' column (any datetime-parsable values)
    and the named pollutant column. Returns a new DataFrame with daily
    mean values, missing days filled with the column median, and values
    rounded to whole numbers. The input frame is not modified.
    """
    daily = df.copy()
    daily["date"] = pd.to_datetime(daily["date"])

    # Collapse (possibly hourly) records into one mean value per day.
    daily = daily.set_index("date")[pollutant].resample("1d").mean().reset_index()

    # Days without observations get the overall median, then round to integers.
    median_value = daily[pollutant].median()
    daily[pollutant] = daily[pollutant].fillna(median_value).apply(lambda v: round(v, 0))

    return daily
52
+
53
+
54
def find_fullest_csv(csv_links: list, year: str):
    """
    Among the links whose text contains *year*, load each CSV and return
    the one with the most rows as a DataFrame.

    Raises IndexError when no link mentions the year — callers catch this
    to detect a missing year of data.
    """
    matching = [link for link in csv_links if str(year) in link]
    best = pd.read_csv(matching[0])
    for candidate in matching[1:]:
        frame = pd.read_csv(candidate)
        if len(frame) > len(best):
            best = frame
    return best
62
+
63
+
64
def get_air_quality_from_eea(city_name: str,
                             pollutant: str,
                             start_year: str,
                             end_year: str):
    """
    Fetch daily air quality data for a city from the EEA and return it
    as a pandas DataFrame.

    EEA means European Environmental Agency, so it has data for
    European Union countries ONLY.

    Parameters
    ----------
    city_name : city to query.
    pollutant : pollutant code, e.g. "NO2", "PM10", "PM2.5".
    start_year, end_year : inclusive year range as strings
        (example: "2014", "2022"...). Data is parsed in 1-year batches,
        so specify years, not dates.

    Returns
    -------
    DataFrame with columns ['city_name', 'date', <pollutant>], daily rows.
    """
    start_of_cell = time.time()

    params = {
        'CountryCode': '',
        'CityName': city_name,
        'Pollutant': pollutant.upper(),
        'Year_from': start_year,
        'Year_to': end_year,
        'Station': '',
        'Source': 'All',
        'Samplingpoint': '',
        'Output': 'TEXT',
        'UpdateDate': '',
        'TimeCoverage': 'Year'
    }

    # observations endpoint (returns a plain-text list of CSV links)
    base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?"
    try:
        response = requests.get(base_url, params=params)
    except requests.ConnectionError:
        # One blind retry on a transient network failure. NOTE: the original
        # caught the builtin ConnectionError, which does NOT catch
        # requests.exceptions.ConnectionError.
        response = requests.get(base_url, params=params)

    response.encoding = response.apparent_encoding
    csv_links = response.text.split("\r\n")

    res_df = pd.DataFrame()

    for year in range(int(start_year), int(end_year) + 1):
        try:
            # find the fullest (biggest) csv file with observations for this year
            _df = find_fullest_csv(csv_links, year)
            res_df = pd.concat([res_df, _df])
        except IndexError:
            # BUG FIX: original referenced undefined name `city` here,
            # raising NameError instead of printing the warning.
            print(f"!! Missing data for {year} for {city_name} city.")

    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"

    res_df = res_df.rename(columns={
        'DatetimeBegin': 'date',
        'Concentration': pollutant
    })

    # cut the trailing timezone offset (e.g. "+01:00"), then parse dates
    res_df['date'] = res_df['date'].apply(lambda x: x[:-6])
    res_df['date'] = pd.to_datetime(res_df['date'])

    res_df = convert_to_daily(res_df, pollutant)

    res_df['city_name'] = city_name
    res_df = res_df[['city_name', 'date', pollutant]]

    end_of_cell = time.time()

    print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df
138
+
139
+
140
+
141
+ ##################################### USEPA
142
# Cache mapping CBSA display names (as returned by the USEPA list endpoint)
# to their codes; lazily populated by get_city_code() on first use.
city_code_dict = {}

# USEPA AQS parameter codes for each supported pollutant.
pollutant_dict = {
    'CO': '42101',
    'SO2': '42401',
    'NO2': '42602',
    'O3': '44201',
    'PM10': '81102',
    'PM2.5': '88101'
}
151
+
152
def get_city_code(city_name: str):
    """Encode a city name into its USEPA CBSA code, caching the full list
    in the module-level city_code_dict on first call (network request)."""
    if not city_code_dict:
        # Cache empty: download the complete CBSA list once, then retry.
        # NOTE(review): if the API returns no data this recurses forever —
        # behavior preserved from the original.
        credentials = {
            "email": "test@aqs.api",
            "key": "test"
        }
        listing = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", credentials)
        for entry in listing.json()["Data"]:
            city_code_dict[entry['value_represented']] = entry['code']
        return get_city_code(city_name)

    # Substring match against the cached full names; first hit wins.
    matches = [full_name for full_name in city_code_dict.keys() if city_name in full_name]
    return city_code_dict[matches[0]]
169
+
170
+
171
def get_air_quality_from_usepa(city_name: str,
                               pollutant: str,
                               start_date: str,
                               end_date: str):
    """
    Fetch daily air quality data for a US city and return it as a
    pandas DataFrame with columns ['city_name', 'date', <pollutant>].

    Queries the USEPA daily-data-by-CBSA endpoint one calendar year at a
    time, concatenates the batches, and averages duplicate records
    (several stations may report the same day).

    USEPA means United States Environmental Protection Agency, so it has
    data for the US ONLY.
    """
    start_of_cell = time.time()
    res_df = pd.DataFrame()

    for batch_start, batch_end in make_date_intervals(start_date, end_date):
        params = {
            "email": "test@aqs.api",
            "key": "test",
            "param": pollutant_dict[pollutant.upper().replace("_", ".")],  # encoded pollutant
            "bdate": batch_start,
            "edate": batch_end,
            "cbsa": get_city_code(city_name)  # Core-based statistical area
        }

        # observations endpoint
        base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?"
        payload = requests.get(base_url, params=params).json()
        batch_df = pd.DataFrame(payload["Data"])

        # normalize the pollutant name used for the output column
        pollutant = pollutant.lower()
        if pollutant == "pm2.5":
            pollutant = "pm2_5"
        batch_df = batch_df.rename(columns={
            'date_local': 'date',
            'arithmetic_mean': pollutant
        })

        batch_df['date'] = pd.to_datetime(batch_df['date'])
        batch_df['city_name'] = city_name
        res_df = pd.concat([res_df, batch_df[['city_name', 'date', pollutant]]])

    # collapse duplicated rows (several records per day and station)
    res_df = res_df.groupby(['date', 'city_name'], as_index=False)[pollutant].mean()
    res_df[pollutant] = round(res_df[pollutant], 1)

    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df
224
+
225
+
226
def make_date_intervals(start_date, end_date):
    """
    Split [start_date, end_date] (both 'YYYY-MM-DD') into per-calendar-year
    intervals, returned as (start, end) tuples formatted 'YYYYMMDD'.

    Example:
        ("2020-12-20", "2021-01-10") ->
        [('20201220', '20201231'), ('20210101', '20210110')]
    """
    start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    date_intervals = []
    for year in range(start_dt.year, end_dt.year + 1):
        year_start = datetime.datetime(year, 1, 1)
        year_end = datetime.datetime(year, 12, 31)
        interval_start = max(start_dt, year_start)
        interval_end = min(end_dt, year_end)
        # BUG FIX: use <= (was <) so single-day ranges and ranges ending on
        # the first day of a year are not silently dropped.
        if interval_start <= interval_end:
            date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d')))
    return date_intervals
238
+
239
+ ##################################### Weather Open Meteo
240
def get_weather_data_from_open_meteo(city_name: str,
                                     start_date: str,
                                     end_date: str,
                                     coordinates: list = None,
                                     forecast: bool = False):
    """
    Fetch daily weather data from Open-Meteo for a city name OR explicit
    coordinates, returned as a pandas DataFrame with one row per day.

    Args:
        city_name: geocoded when coordinates are not given; also stamped
            into the 'city_name' column of the result.
        start_date / end_date: 'YYYY-MM-DD' strings.
        coordinates: optional (latitude, longitude) pair; skips geocoding.
        forecast: True -> forecast endpoint, False -> historical archive.

    Examples of arguments:
        coordinates=(47.755, -122.2806), start_date="2023-01-01"
    """
    start_of_cell = time.time()

    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)

    daily_metrics = ["temperature_2m_max", "temperature_2m_min",
                     "precipitation_sum", "rain_sum", "snowfall_sum",
                     "precipitation_hours", "windspeed_10m_max",
                     "windgusts_10m_max", "winddirection_10m_dominant"]
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'daily': daily_metrics,
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }

    if forecast:
        # historical forecast endpoint
        base_url = 'https://api.open-meteo.com/v1/forecast'
    else:
        # historical observations endpoint
        base_url = 'https://archive-api.open-meteo.com/v1/archive'

    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        # blind retry on connection failure (behavior preserved)
        response = requests.get(base_url, params=params)

    res_df = pd.DataFrame(response.json()["daily"])
    res_df["city_name"] = city_name

    # rename columns to the project's naming scheme
    res_df = res_df.rename(columns={
        "time": "date",
        "temperature_2m_max": "temperature_max",
        "temperature_2m_min": "temperature_min",
        "windspeed_10m_max": "wind_speed_max",
        "winddirection_10m_dominant": "wind_direction_dominant",
        "windgusts_10m_max": "wind_gusts_max"
    })

    # fixed column order, then parse the date column
    res_df = res_df[['city_name', 'date', 'temperature_max', 'temperature_min',
                     'precipitation_sum', 'rain_sum', 'snowfall_sum',
                     'precipitation_hours', 'wind_speed_max',
                     'wind_gusts_max', 'wind_direction_dominant']]
    res_df["date"] = pd.to_datetime(res_df["date"])

    end_of_cell = time.time()
    print(f"Parsed weather for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df
311
+
312
+
313
+ ##################################### Air Quality data from Open Meteo
314
def get_aqi_data_from_open_meteo(city_name: str,
                                 start_date: str,
                                 end_date: str,
                                 coordinates: list = None,
                                 pollutant: str = "pm2_5"):
    """
    Fetch air quality (AQI) data from the Open-Meteo air-quality API for a
    city name OR explicit coordinates, averaged from hourly to daily rows.

    Examples of arguments:
        coordinates=(47.755, -122.2806),
        start_date="2023-01-01",
        pollutant="no2"

    Returns a DataFrame with columns ['city_name', 'date', <pollutant>].
    """
    start_of_cell = time.time()

    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)

    # normalize pollutant aliases to the API's naming
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
    if pollutant == "no2":
        # make it work with both "no2" and "nitrogen_dioxide" passed
        pollutant = "nitrogen_dioxide"

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': [pollutant],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }

    # base endpoint
    base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        # blind retry on connection failure (behavior preserved)
        response = requests.get(base_url, params=params)

    res_df = pd.DataFrame(response.json()["hourly"])

    # hourly -> daily means, rounded to one decimal
    res_df["time"] = pd.to_datetime(res_df["time"])
    res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
    res_df[pollutant] = round(res_df[pollutant], 1)

    res_df = res_df.rename(columns={"time": "date"})
    res_df["city_name"] = city_name
    res_df = res_df[['city_name', 'date', pollutant]]

    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df
385
+