Spaces:

Annikaijak
/

Air_quality

Sleeping

File size: 12,746 Bytes

57dbe6b

import os
import datetime
import time
import requests
import pandas as pd
import json

from geopy.geocoders import Nominatim




def convert_date_to_unix(x):
    """
    Convert datetime to unix time in milliseconds.
    """
    dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    dt_obj = int(dt_obj.timestamp() * 1000)
    return dt_obj


def get_city_coordinates(city_name: str):
    """
    Takes city name and returns its latitude and longitude (rounded to 2 digits after dot).
    """ 
    # Initialize Nominatim API (for getting lat and long of the city)
    geolocator = Nominatim(user_agent="MyApp")
    city = geolocator.geocode(city_name)

    latitude = round(city.latitude, 2)
    longitude = round(city.longitude, 2)
    
    return latitude, longitude


##################################### EEA
def convert_to_daily(df, pollutant: str):
    """
    Returns DataFrame where pollutant column is resampled to days and rounded.
    """
    res_df = df.copy()
    # convert dates in 'time' column
    res_df["date"] = pd.to_datetime(res_df["date"])
    
    # I want data daily, not hourly (mean per each day = 1 datarow per 1 day)
    res_df = res_df.set_index('date')
    res_df = res_df[pollutant].resample('1d').mean().reset_index()
    res_df[pollutant] = res_df[pollutant].fillna(res_df[pollutant].median())
    res_df[pollutant] = res_df[pollutant].apply(lambda x: round(x, 0))
    
    return res_df


def find_fullest_csv(csv_links: list, year: str):
    candidates = [link for link in csv_links if str(year) in link]
    biggest_df = pd.read_csv(candidates[0])
    for link in candidates[1:]:
        _df = pd.read_csv(link)
        if len(biggest_df) < len(_df):
            biggest_df = _df
    return biggest_df


def get_air_quality_from_eea(city_name: str,
                             pollutant: str,
                             start_year: str,
                             end_year: str):
    """
    Takes city name, daterange and returns pandas DataFrame with daily air quality data.
    It parses data by 1-year batches, so please specify years, not dates. (example: "2014", "2022"...)
    
    EEA means European Environmental Agency. So it has data for Europe Union countries ONLY.
    """
    start_of_cell = time.time()
    
    params = {
        'CountryCode': '',
        'CityName': city_name,
        'Pollutant': pollutant.upper(),
        'Year_from': start_year,
        'Year_to': end_year,
        'Station': '',
        'Source': 'All',
        'Samplingpoint': '',
        'Output': 'TEXT',
        'UpdateDate': '',
        'TimeCoverage': 'Year'
    }

    # observations endpoint
    base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?"
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)
        
    response.encoding = response.apparent_encoding
    csv_links = response.text.split("\r\n")
    
    res_df = pd.DataFrame()
    target_year = int(start_year)
    
    for year in range(int(start_year), int(end_year) + 1):
        try:
            # find the fullest, the biggest csv file with observations for this particular year
            _df = find_fullest_csv(csv_links, year)
            # append it to res_df
            res_df = pd.concat([res_df, _df])
        except IndexError:
            print(f"!! Missing data for {year} for {city} city.")
            pass
        
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
        
    res_df = res_df.rename(columns={
        'DatetimeBegin': 'date',
        'Concentration': pollutant        
    })
    
    # cut timezones info
    res_df['date'] = res_df['date'].apply(lambda x: x[:-6])
    # convert dates in 'time' column
    res_df['date'] = pd.to_datetime(res_df['date'])
    
    res_df = convert_to_daily(res_df, pollutant)
    
    res_df['city_name'] = city_name
    res_df = res_df[['city_name', 'date', pollutant.lower()]]
    
    end_of_cell = time.time()
    
    print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
    
    return res_df



##################################### USEPA
city_code_dict = {}
pollutant_dict = {
    'CO': '42101',
    'SO2': '42401',
    'NO2': '42602',
    'O3': '44201',
    'PM10': '81102',
    'PM2.5': '88101'
}

def get_city_code(city_name: str):
    "Encodes city name to be used later for data parsing using USEPA."
    if city_code_dict:
        city_full = [i for i in city_code_dict.keys() if city_name in i][0]
        return city_code_dict[city_full]
    else:
        params = {
            "email": "test@aqs.api",
            "key": "test"
        }
        response = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", params)
        response_json = response.json()
        data = response_json["Data"]
        for item in data:
            city_code_dict[item['value_represented']] = item['code']
        
        return get_city_code(city_name)


def get_air_quality_from_usepa(city_name: str,
                               pollutant: str,
                               start_date: str,
                               end_date: str):
    """
    Takes city name, daterange and returns pandas DataFrame with daily air quality data.
    
    USEPA means United States Environmental Protection Agency. So it has data for US ONLY.
    """
    start_of_cell = time.time()   
    res_df = pd.DataFrame()
    
    for start_date_, end_date_ in make_date_intervals(start_date, end_date):
        params = {
            "email": "test@aqs.api",
            "key": "test",
            "param": pollutant_dict[pollutant.upper().replace("_", ".")], # encoded pollutant 
            "bdate": start_date_,
            "edate": end_date_,
            "cbsa": get_city_code(city_name) # Core-based statistical area
        }

        # observations endpoint
        base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?" 

        response = requests.get(base_url, params=params)
        response_json = response.json()
        
        df_ = pd.DataFrame(response_json["Data"])
        
        pollutant = pollutant.lower()      
        if pollutant == "pm2.5":
            pollutant = "pm2_5"
        df_ = df_.rename(columns={
            'date_local': 'date',
            'arithmetic_mean': pollutant        
        })

        # convert dates in 'date' column
        df_['date'] = pd.to_datetime(df_['date'])
        df_['city_name'] = city_name    
        df_ = df_[['city_name', 'date', pollutant]]
        res_df = pd.concat([res_df, df_])
    
    # there are duplicated rows (several records for the same day and station). get rid of it.
    res_df = res_df.groupby(['date', 'city_name'], as_index=False)[pollutant].mean()
    res_df[pollutant] = round(res_df[pollutant], 1)  
    
    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
    
    return res_df


def make_date_intervals(start_date, end_date):
    start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    date_intervals = []
    for year in range(start_dt.year, end_dt.year + 1):
        year_start = datetime.datetime(year, 1, 1)
        year_end = datetime.datetime(year, 12, 31)
        interval_start = max(start_dt, year_start)
        interval_end = min(end_dt, year_end)
        if interval_start < interval_end:
            date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d')))
    return date_intervals

##################################### Weather Open Meteo
def get_weather_data_from_open_meteo(city_name: str,
                                     start_date: str,
                                     end_date: str,
                                     coordinates: list = None,
                                     forecast: bool = False):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with weather data.
    
    Examples of arguments:
        coordinates=(47.755, -122.2806), start_date="2023-01-01"
    """
    start_of_cell = time.time()
    
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'daily': ["temperature_2m_max", "temperature_2m_min",
                  "precipitation_sum", "rain_sum", "snowfall_sum",
                  "precipitation_hours", "windspeed_10m_max",
                  "windgusts_10m_max", "winddirection_10m_dominant"],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }
    
    if forecast:
        # historical forecast endpoint
        base_url = 'https://api.open-meteo.com/v1/forecast' 
    else:
        # historical observations endpoint
        base_url = 'https://archive-api.open-meteo.com/v1/archive'  
        
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)
    
    response_json = response.json()    
    res_df = pd.DataFrame(response_json["daily"])  
    res_df["city_name"] = city_name
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "date",
        "temperature_2m_max": "temperature_max",
        "temperature_2m_min": "temperature_min",
        "windspeed_10m_max": "wind_speed_max",
        "winddirection_10m_dominant": "wind_direction_dominant",
        "windgusts_10m_max": "wind_gusts_max"
    })
    
    # change columns order
    res_df = res_df[
        ['city_name', 'date', 'temperature_max', 'temperature_min',
         'precipitation_sum', 'rain_sum', 'snowfall_sum',
         'precipitation_hours', 'wind_speed_max',
         'wind_gusts_max', 'wind_direction_dominant']
    ]
    
    # convert dates in 'date' column
    res_df["date"] = pd.to_datetime(res_df["date"])
    end_of_cell = time.time()
    print(f"Parsed weather for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
        
    return res_df


##################################### Air Quality data from Open Meteo
def get_aqi_data_from_open_meteo(city_name: str,
                                 start_date: str,
                                 end_date: str,
                                 coordinates: list = None,
                                 pollutant: str = "pm2_5"):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with AQI data.
    
    Examples of arguments:
        ...
        coordinates=(47.755, -122.2806),
        start_date="2023-01-01",
        pollutant="no2"
        ...
    """
    start_of_cell = time.time()
    
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
    
    # make it work with both "no2" and "nitrogen_dioxide" passed.
    if pollutant == "no2":
        pollutant = "nitrogen_dioxide"
        
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': [pollutant],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }
    
    # base endpoint
    base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"   
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)
    response_json = response.json()    
    res_df = pd.DataFrame(response_json["hourly"])       
    
    # convert dates
    res_df["time"] = pd.to_datetime(res_df["time"])
    
    # resample to days
    res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
    res_df[pollutant] = round(res_df[pollutant], 1)
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "date"
    })
    
    res_df["city_name"] = city_name
    
    # change columns order
    res_df = res_df[
        ['city_name', 'date', pollutant]
    ]
    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
    
    return res_df