File size: 5,276 Bytes
115745a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
from geopy.geocoders import Nominatim
import sys
import os


def initialize_geolocator(user_agent="county_locator"):
    """Create and return a Nominatim geocoder.

    Args:
        user_agent: Identifier sent to the Nominatim service with each request.

    Returns:
        A configured ``Nominatim`` geocoder instance.
    """
    geolocator = Nominatim(user_agent=user_agent)
    return geolocator


def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode a coordinate pair to a county name.

    Args:
        latitude: Latitude of the point.
        longitude: Longitude of the point.
        geolocator: Geocoder exposing ``reverse((lat, lon), language=...)``.

    Returns:
        The county name reported by the geocoder, or ``"Unknown"`` when the
        lookup yields no result or the result carries no county field.
    """
    result = geolocator.reverse((latitude, longitude), language="en")
    if not result:
        return "Unknown"
    address = result.raw.get("address", {})
    return address.get("county", "Unknown")


def add_county_column(df):
    """Replace coordinate columns with a reverse-geocoded county column.

    Each unique (longitude, latitude) pair is geocoded only once; the
    resulting county name is then merged back onto every matching row and
    the coordinate columns are dropped.

    Args:
        df: DataFrame with ``longitude`` and ``latitude`` columns.

    Returns:
        DataFrame where the coordinate columns are replaced by ``county``.
    """
    geolocator = initialize_geolocator()
    unique_coords = df[['longitude', 'latitude']].drop_duplicates()
    unique_coords['county'] = [
        get_county_from_coordinates(lat, lon, geolocator)
        for lon, lat in zip(unique_coords['longitude'], unique_coords['latitude'])
    ]
    merged = df.merge(
        unique_coords[['latitude', 'longitude', 'county']],
        on=['latitude', 'longitude'],
    )
    return merged.drop(['longitude', 'latitude'], axis=1)


def filter_estonian_counties(df):
    """Filter rows to Estonian counties and replace names with integer codes.

    Rows whose ``county`` is not one of the 15 Estonian county names below
    are dropped; the remaining names are mapped to fixed integer codes
    (code 12 is intentionally unused in this mapping).

    Args:
        df: DataFrame with a ``county`` column holding county names.

    Returns:
        A copy of the matching rows with ``county`` encoded as integers.
    """
    # Single source of truth: the dict keys double as the allow-list,
    # removing the duplicated county-name list the original carried.
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # Take an explicit copy: assigning through `.loc` on a boolean-mask slice
    # of the caller's frame triggers pandas' SettingWithCopyWarning and is
    # fragile under copy-on-write semantics.
    df = df[df['county'].isin(county_to_int)].copy()
    df['county'] = df['county'].map(county_to_int)
    return df


def filter_data(train, client, weather, is_business, product_type, county_code):
    """Restrict the three frames to one (is_business, product_type, county) slice.

    Args:
        train: Frame with ``is_business``, ``product_type``, ``county`` and
            ``is_consumption`` columns; only production rows
            (``is_consumption == 0``) are kept.
        client: Frame with ``is_business``, ``product_type`` and ``county``.
        weather: Frame with a ``county`` column.
        is_business: Business-flag value to keep.
        product_type: Product-type value to keep.
        county_code: Integer county code to keep.

    Returns:
        Tuple ``(train, client, weather)``, each filtered and with the
        filter columns removed.
    """
    train_mask = (
        (train['is_business'] == is_business)
        & (train['product_type'] == product_type)
        & (train['county'] == county_code)
    )
    # Keep production rows only, then discard every column used for filtering.
    train = train[train_mask]
    train = train[train['is_consumption'] == 0]
    train = train.drop(['is_business', 'product_type', 'county', 'is_consumption'], axis=1)

    client_mask = (
        (client['is_business'] == is_business)
        & (client['product_type'] == product_type)
        & (client['county'] == county_code)
    )
    client = client[client_mask].drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code].drop(['county'], axis=1)

    return train, client, weather

def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset to its corresponding path.

    Args:
        datasets: Sequence of DataFrames to persist.
        paths: Optional sequence of target file paths, one per dataset.
            Defaults to the five standard files under ``process_files/``
            next to this module.

    Raises:
        ValueError: If ``datasets`` and ``paths`` have different lengths
            (the original silently dropped the extras via ``zip``).
    """
    if paths is None:
        # Resolve defaults relative to this file, not the working directory.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                 'electricity_prices.pkl', 'gas_prices.pkl']
        paths = [os.path.join(base_dir, 'process_files', name) for name in names]

    if len(datasets) != len(paths):
        raise ValueError(f"got {len(datasets)} datasets but {len(paths)} paths")

    for dataset, path in zip(datasets, paths):
        # Create the target directory on demand so to_pickle cannot fail
        # with FileNotFoundError on a fresh checkout.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        dataset.to_pickle(path)



def filter_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Clean, geocode, filter and persist the five raw datasets.

    Drops bookkeeping columns in place, parses date columns, attaches an
    integer county code to the weather data, restricts everything to one
    (is_business, product_type, county) segment and writes the results out
    as pickle files.
    """
    # Segment selected for modelling.
    is_business, product_type, county_code = 1, 3, 0

    # (frame, columns to drop, columns to parse as datetimes)
    cleanup_plan = (
        (train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']),
        (client, ['data_block_id'], ['date']),
        (historical_weather, ['data_block_id'], ['datetime']),
        (electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
        (gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
    )
    for frame, obsolete_cols, datetime_cols in cleanup_plan:
        frame.drop(obsolete_cols, axis=1, inplace=True)
        for column in datetime_cols:
            frame[column] = pd.to_datetime(frame[column])

    # Geocode weather locations to counties, keep Estonian ones, then
    # average all locations of a county per timestamp.
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)
    historical_weather = (
        historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()
    )

    train, client, historical_weather = filter_data(
        train, client, historical_weather, is_business, product_type, county_code
    )

    save_datasets_to_pickle(
        [train, client, historical_weather, electricity_prices, gas_prices]
    )