"""Build a one-week dataset combining weather and air-pollution data.

The pipeline (see :func:`get_data`) downloads daily weather for Utrecht,
downloads NO2/O3 measurements for a fixed set of stations, averages the
measurements per day, and writes the merged table to ``dataset.csv``.
"""

import codecs
import csv
import http.client
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import date, timedelta
from io import StringIO

import pandas as pd

_PARTICLES = ["NO2", "O3"]
_STATIONS = ["NL10636", "NL10639", "NL10643"]
# Number of 24-hour windows fetched, counting back from today.
_DAYS_BACK = 8
_TIME_SUFFIX = "T09:00:00Z"
_WEATHER_CSV_NAME = 'weather_data.csv'
# Optional sign applies to both the decimal and the integer alternative.
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")
# NOTE(security): credential committed to source; prefer supplying it through
# the VISUAL_CROSSING_API_KEY environment variable and rotating this key.
_DEFAULT_API_KEY = "7Y6AY56M6RWVNHQ3SAVHNJWFS"


def _timestamp(day):
    """Return *day* as the ``YYYY-MM-DDT09:00:00Z`` string used in URLs and file names."""
    return day.isoformat() + _TIME_SUFFIX


def _pollution_csv_name(particle, day):
    """Return the temporary CSV name for *particle* on the window ending at *day*."""
    return f'{particle}_{_timestamp(day)}.csv'


def _weather_csv_path():
    """Return the weather CSV path, located next to this source file."""
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), _WEATHER_CSV_NAME)


def _fetch_measurements(station, particle, start, end):
    """Request one station/particle window and return its 'value' columns."""
    conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
    try:
        conn.request(
            "GET",
            f"/open_api/measurements?station_number={station}&formula={particle}"
            f"&page=1&order_by=timestamp_measured&order_direction=desc"
            f"&end={end}&start={start}",
            '',
            {},
        )
        decoded = conn.getresponse().read().decode("utf-8")
    finally:
        # Always release the socket, even when the request or read fails.
        conn.close()
    # NOTE(review): the body is parsed as CSV and only columns whose name
    # contains 'value' are kept; clean_values() later regex-extracts numbers
    # from the saved cells. Confirm the response format against the API docs.
    return pd.read_csv(StringIO(decoded)).filter(like='value')


def pollution_data():
    """Download NO2 and O3 measurements and save one CSV per particle and day.

    For each of the last eight 24-hour windows (ending at 09:00Z of today,
    yesterday, ...), the readings of all stations are concatenated and written
    to ``{particle}_{end}.csv`` in the current working directory.
    """
    base_day = date.today()
    for offset in range(_DAYS_BACK):
        end = _timestamp(base_day - timedelta(offset))
        start = _timestamp(base_day - timedelta(offset + 1))
        for particle in _PARTICLES:
            # A fresh list per particle and day: a shared accumulator would
            # leak other particles' and earlier days' readings into this file.
            frames = [
                _fetch_measurements(station, particle, start, end)
                for station in _STATIONS
            ]
            combined = pd.concat(frames, ignore_index=True)
            combined.to_csv(f'{particle}_{end}.csv', index=False)


def delete_csv(csvs):
    """Remove every path in *csvs* that exists and is a regular file."""
    for path in csvs:
        if os.path.isfile(path):
            os.remove(path)


def clean_values():
    """Average the saved pollution CSVs per day and delete them.

    Returns:
        A ``(NO2, O3)`` pair of lists holding one average per day, ordered
        from today backwards. A day whose file contains no number yields NaN
        so that the positions stay aligned with the days.

    Raises:
        OSError: if an expected CSV cannot be opened (temporary files are
            still cleaned up).
    """
    base_day = date.today()
    csvs = [
        _pollution_csv_name(particle, base_day - timedelta(offset))
        for offset in range(_DAYS_BACK)
        for particle in _PARTICLES
    ]
    NO2 = []
    O3 = []
    try:
        for csv_file in csvs:
            values = []
            with open(csv_file, 'r', newline='', encoding='utf-8') as file:
                for row in csv.reader(file):
                    for cell in row:
                        # Keep only the first number found in each cell.
                        match = _NUMBER_RE.search(cell)
                        if match:
                            values.append(float(match.group()))
            average = sum(values) / len(values) if values else float('nan')
            if "NO2" in csv_file:
                NO2.append(average)
            else:
                O3.append(average)
    finally:
        delete_csv(csvs)
    return NO2, O3


def add_columns(file_path=None):
    """Load the weather CSV and add empty ``NO2``, ``O3`` and ``weekday`` columns.

    Args:
        file_path: CSV to load; defaults to the file written by
            :func:`weather_data`.

    Returns:
        The loaded DataFrame with the three placeholder columns inserted at
        positions 1, 2 and 10.
    """
    if file_path is None:
        file_path = _weather_csv_path()
    df = pd.read_csv(file_path)
    df.insert(1, 'NO2', None)
    df.insert(2, 'O3', None)
    df.insert(10, 'weekday', None)
    return df


def scale(data):
    """Reorder, rename and rescale the weather columns.

    Args:
        data: DataFrame as produced by :func:`add_columns`.

    Returns:
        A new DataFrame with reordered/renamed columns, ``weekday`` filled
        with the day name of ``date``, and the numeric weather columns
        multiplied as below and truncated to ``int``.
    """
    columns = list(data.columns)
    # Three positional moves, applied in this exact order.
    columns.insert(3, columns.pop(6))
    columns.insert(5, columns.pop(9))
    columns.insert(9, columns.pop(6))
    df = data[columns].rename(columns={
        'datetime': 'date',
        'windspeed': 'wind_speed',
        'temp': 'mean_temp',
        'solarradiation': 'global_radiation',
        'precip': 'percipitation',
        'sealevelpressure': 'pressure',
        'visibility': 'minimum_visibility',
    })

    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df['date'].dt.day_name()

    # Presumably km/h -> m/s, then stored in tenths like the columns below.
    df['wind_speed'] = (df['wind_speed'] / 3.6) * 10
    for column in ('mean_temp', 'minimum_visibility', 'percipitation', 'pressure'):
        df[column] = df[column] * 10

    for column in (
        'wind_speed',
        'mean_temp',
        'minimum_visibility',
        'percipitation',
        'pressure',
        'humidity',
        'global_radiation',
    ):
        df[column] = df[column].astype(int)
    return df


def insert_pollution(NO2, O3, data):
    """Fill the ``NO2``/``O3`` columns of *data* and write ``dataset.csv``.

    Values are popped from the END of each list, so the last list element
    lands in row 0, the second-to-last in row 1, and so on. Both lists are
    emptied in the process.

    Args:
        NO2: per-day NO2 averages (consumed).
        O3: per-day O3 averages (consumed).
        data: DataFrame to fill; modified in place.
    """
    df = data
    for column, averages in (('NO2', NO2), ('O3', O3)):
        row = 0
        while averages:
            df.loc[row, column] = averages.pop()
            row += 1
    df.to_csv('dataset.csv', index=False)


def weather_data():
    """Download the last week of daily weather for Utrecht as a CSV file.

    The file is saved as ``weather_data.csv`` next to this source file. On an
    HTTP or connection error the problem is printed and the process exits
    with status 1.
    """
    current_day = date.today()
    today = current_day.isoformat()
    seven_days = (current_day - timedelta(7)).isoformat()
    api_key = os.environ.get("VISUAL_CROSSING_API_KEY") or _DEFAULT_API_KEY
    url = (
        "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/"
        f"timeline/Utrecht/{seven_days}/{today}?unitGroup=metric"
        "&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure"
        "%2Cvisibility%2Chumidity&include=days"
        f"&key={api_key}&maxStations=1&contentType=csv"
    )

    try:
        with urllib.request.urlopen(url) as response:
            # Decode the streamed bytes line by line and re-emit them as CSV.
            rows = csv.reader(codecs.iterdecode(response, 'utf-8'))
            with open(_weather_csv_path(), 'w', newline='', encoding='utf-8') as csvfile:
                csv.writer(csvfile).writerows(rows)
    except urllib.error.HTTPError as e:
        # HTTPError must be handled first: it is a subclass of URLError.
        ErrorInfo = e.read().decode()
        print('Error code: ', e.code, ErrorInfo)
        sys.exit(1)
    except urllib.error.URLError as e:
        # A plain URLError carries only a reason (no status code or body).
        print('Error: ', e.reason)
        sys.exit(1)


def get_data():
    """Run the full pipeline and leave the result in ``dataset.csv``."""
    weather_data()
    pollution_data()
    NO2, O3 = clean_values()
    df = add_columns()
    scaled_df = scale(df)
    insert_pollution(NO2, O3, scaled_df)
    os.remove(_weather_csv_path())