import os

import pandas as pd
from geopy.geocoders import Nominatim


def initialize_geolocator(user_agent="county_locator"):
    """Initialize the geolocator object."""
    return Nominatim(user_agent=user_agent)


def get_county_from_coordinates(latitude, longitude, geolocator):
    """Retrieve the county name for a coordinate pair via reverse geocoding."""
    location = geolocator.reverse((latitude, longitude), language="en")
    if location:
        return location.raw.get("address", {}).get("county", "Unknown")
    return "Unknown"


def add_county_column(df):
    """Add a county column to the DataFrame based on its coordinates."""
    geolocator = initialize_geolocator()
    # Reverse-geocode each unique coordinate pair once, then merge the result
    # back onto the full DataFrame to avoid redundant geocoding requests.
    coordinates = df[['longitude', 'latitude']].drop_duplicates()
    coordinates['county'] = coordinates.apply(
        lambda row: get_county_from_coordinates(row['latitude'], row['longitude'], geolocator),
        axis=1
    )
    df = df.merge(coordinates[['latitude', 'longitude', 'county']], on=['latitude', 'longitude'])
    df = df.drop(['longitude', 'latitude'], axis=1)
    return df


def filter_estonian_counties(df):
    """Filter rows by Estonian counties and map county names to integers."""
    county_locations = [
        'Saare County', 'Võru County', 'Pärnu County', 'Valga County',
        'Viljandi County', 'Tartu County', 'Põlva County', 'Jõgeva County',
        'Hiiu County', 'Lääne County', 'Rapla County', 'Järva County',
        'Harju County', 'Lääne-Viru County', 'Ida-Viru County'
    ]
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7,
        'Valga County': 13, 'Viljandi County': 14, 'Tartu County': 11,
        'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1,
        'Lääne County': 6, 'Rapla County': 9, 'Järva County': 3,
        'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # Copy after filtering so the integer mapping is a plain assignment
    # rather than a chained write on a view.
    df = df[df['county'].isin(county_locations)].copy()
    df['county'] = df['county'].map(county_to_int)
    return df


def filter_data(train, client, weather, is_business, product_type, county_code):
    """Filter the datasets by is_business, product_type, county_code and is_consumption."""
    train = train[
        (train['is_business'] == is_business) &
        (train['product_type'] == product_type) &
        (train['county'] == county_code)
    ]
    train = train.drop(['is_business', 'product_type', 'county'], axis=1)
    # Keep only generation rows (is_consumption == 0).
    train = train[train['is_consumption'] == 0]
    train = train.drop(['is_consumption'], axis=1)

    client = client[
        (client['is_business'] == is_business) &
        (client['product_type'] == product_type) &
        (client['county'] == county_code)
    ]
    client = client.drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code]
    weather = weather.drop(['county'], axis=1)

    return train, client, weather


def save_datasets_to_pickle(datasets, paths=None):
    """Save each dataset in datasets to the corresponding path in paths as a pickle file."""
    # Resolve paths relative to the directory of the current file.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    if paths is None:
        paths = [
            os.path.join(base_dir, 'process_files', 'generation.pkl'),
            os.path.join(base_dir, 'process_files', 'client.pkl'),
            os.path.join(base_dir, 'process_files', 'historical_weather.pkl'),
            os.path.join(base_dir, 'process_files', 'electricity_prices.pkl'),
            os.path.join(base_dir, 'process_files', 'gas_prices.pkl'),
        ]
    # Save each dataset to its corresponding path.
    for dataset, path in zip(datasets, paths):
        dataset.to_pickle(path)


def filter_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    # Filter parameters
    is_business, product_type, county_code = 1, 3, 0

    # Drop unnecessary columns and convert date columns to datetime.
    datasets_info = [  # [df, [drop_cols], [date_cols]]
        [train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']],
        [client, ['data_block_id'], ['date']],
        [historical_weather, ['data_block_id'], ['datetime']],
        [electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']],
        [gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']]
    ]
    for df, drop_cols, date_cols in datasets_info:
        df.drop(drop_cols, axis=1, inplace=True)
        for col in date_cols:
            df[col] = pd.to_datetime(df[col])

    # Add county information and filter the weather data to Estonian counties.
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)

    # Average the weather measurements per county and timestamp.
    historical_weather = historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()

    # Filter data by is_business, product_type, county_code.
    train, client, historical_weather = filter_data(
        train, client, historical_weather, is_business, product_type, county_code
    )

    # Save datasets to pickle files.
    save_datasets_to_pickle([train, client, historical_weather, electricity_prices, gas_prices])
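

# Nominatim's public endpoint allows at most one request per second, so the
# bulk reverse geocoding in add_county_column can be throttled or blocked on
# large coordinate sets. Below is a minimal sketch of a rate-limited
# alternative using geopy's RateLimiter; initialize_rate_limited_reverse is a
# hypothetical helper, not part of the original pipeline.
from geopy.extra.rate_limiter import RateLimiter


def initialize_rate_limited_reverse(user_agent="county_locator", min_delay_seconds=1):
    """Return a reverse-geocoding callable that waits between requests and
    returns None on transient errors instead of raising."""
    geolocator = Nominatim(user_agent=user_agent)
    return RateLimiter(geolocator.reverse,
                       min_delay_seconds=min_delay_seconds,
                       swallow_exceptions=True)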
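

# Example entry point showing how filter_datasets is meant to be called. The
# CSV file names below are assumptions inferred from the column names used
# above; adjust them to wherever the raw files actually live.
if __name__ == "__main__":
    data_dir = os.path.dirname(os.path.abspath(__file__))
    filter_datasets(
        pd.read_csv(os.path.join(data_dir, 'train.csv')),
        pd.read_csv(os.path.join(data_dir, 'client.csv')),
        pd.read_csv(os.path.join(data_dir, 'historical_weather.csv')),
        pd.read_csv(os.path.join(data_dir, 'electricity_prices.csv')),
        pd.read_csv(os.path.join(data_dir, 'gas_prices.csv')),
    )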