import os
import sys

import pandas as pd
from geopy.geocoders import Nominatim


def initialize_geolocator(user_agent="county_locator"):
    """Initialize the Nominatim geolocator."""
    return Nominatim(user_agent=user_agent)
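

# Hedged sketch (not called by the pipeline below): Nominatim's usage policy
# allows roughly one request per second, so bulk reverse geocoding is safer
# behind geopy's RateLimiter; the 1-second delay here is an assumption
# matching that policy, and this helper is not part of the original pipeline.
def initialize_rate_limited_reverse(geolocator, min_delay_seconds=1):
    """Return a rate-limited wrapper around geolocator.reverse."""
    from geopy.extra.rate_limiter import RateLimiter
    return RateLimiter(geolocator.reverse, min_delay_seconds=min_delay_seconds)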


def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode a coordinate pair and return its county name."""
    location = geolocator.reverse((latitude, longitude), language="en")
    if location:
        return location.raw.get("address", {}).get("county", "Unknown")
    # geolocator.reverse returns None when nothing matches the coordinates.
    return "Unknown"
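

# Hedged sketch: geopy calls can raise GeocoderTimedOut or GeocoderServiceError
# (both in geopy.exc) on transient network problems; this hypothetical helper,
# not part of the original pipeline, retries a few times before falling back
# to "Unknown".
def get_county_with_retries(latitude, longitude, geolocator, retries=3):
    """Retry get_county_from_coordinates on transient geocoder errors."""
    from geopy.exc import GeocoderServiceError, GeocoderTimedOut
    for _ in range(retries):
        try:
            return get_county_from_coordinates(latitude, longitude, geolocator)
        except (GeocoderTimedOut, GeocoderServiceError):
            continue
    return "Unknown"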


def add_county_column(df):
    """Add a county column to the DataFrame based on its coordinates."""
    geolocator = initialize_geolocator()
    # Reverse-geocode each unique (latitude, longitude) pair once, then merge
    # the result back in, so repeated coordinates cost only one API call each.
    coordinates = df[['longitude', 'latitude']].drop_duplicates()
    coordinates['county'] = coordinates.apply(
        lambda row: get_county_from_coordinates(row['latitude'], row['longitude'], geolocator),
        axis=1,
    )
    df = df.merge(coordinates[['latitude', 'longitude', 'county']], on=['latitude', 'longitude'])
    df = df.drop(['longitude', 'latitude'], axis=1)
    return df


def filter_estonian_counties(df):
    """Keep rows in Estonian counties and map county names to integer codes."""
    county_locations = [
        'Saare County', 'Võru County', 'Pärnu County', 'Valga County', 'Viljandi County', 'Tartu County',
        'Põlva County', 'Jõgeva County', 'Hiiu County', 'Lääne County', 'Rapla County', 'Järva County',
        'Harju County', 'Lääne-Viru County', 'Ida-Viru County'
    ]
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # Copy after the boolean filter so the assignment below does not trigger
    # pandas' SettingWithCopyWarning.
    df = df[df['county'].isin(county_locations)].copy()
    df.loc[:, 'county'] = df['county'].map(county_to_int)
    return df


def filter_data(train, client, weather, is_business, product_type, county_code):
    """Filter train, client, and weather data by is_business, product_type,
    and county_code, keeping only production rows (is_consumption == 0)."""
    train = train[
        (train['is_business'] == is_business) &
        (train['product_type'] == product_type) &
        (train['county'] == county_code)
    ]
    train = train.drop(['is_business', 'product_type', 'county'], axis=1)
    # Keep production rows only, then drop the now-constant flag.
    train = train[train['is_consumption'] == 0]
    train = train.drop(['is_consumption'], axis=1)

    client = client[
        (client['is_business'] == is_business) &
        (client['product_type'] == product_type) &
        (client['county'] == county_code)
    ]
    client = client.drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code]
    weather = weather.drop(['county'], axis=1)

    return train, client, weather


def save_datasets_to_pickle(datasets, paths=None):
    """Save each dataset in datasets to the corresponding path in paths as a pickle."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    if paths is None:
        paths = [
            os.path.join(base_dir, 'process_files', 'generation.pkl'),
            os.path.join(base_dir, 'process_files', 'client.pkl'),
            os.path.join(base_dir, 'process_files', 'historical_weather.pkl'),
            os.path.join(base_dir, 'process_files', 'electricity_prices.pkl'),
            os.path.join(base_dir, 'process_files', 'gas_prices.pkl'),
        ]
    for dataset, path in zip(datasets, paths):
        # Create the target directory on first use so to_pickle cannot fail
        # on a missing process_files folder.
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        dataset.to_pickle(path)
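

# Hedged sketch: the pickles written above round-trip back with
# pandas.read_pickle; this hypothetical helper mirrors the default paths in
# save_datasets_to_pickle and is not part of the original pipeline.
def load_dataset_from_pickle(name):
    """Load one processed dataset (e.g. 'generation') from process_files/."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    return pd.read_pickle(os.path.join(base_dir, 'process_files', name + '.pkl'))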


def filter_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Clean the raw datasets, attach county codes to the weather data,
    filter everything to one segment, and save the results as pickles."""
    is_business, product_type, county_code = 1, 3, 0

    # (dataframe, columns to drop, columns to parse as datetimes)
    datasets_info = [
        [train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']],
        [client, ['data_block_id'], ['date']],
        [historical_weather, ['data_block_id'], ['datetime']],
        [electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']],
        [gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']]
    ]
    for df, drop_cols, date_cols in datasets_info:
        df.drop(drop_cols, axis=1, inplace=True)
        for col in date_cols:
            df[col] = pd.to_datetime(df[col])

    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)

    # Average the weather measurements per county and timestamp; numeric_only
    # keeps newer pandas versions from raising on non-numeric columns.
    historical_weather = historical_weather.groupby(['county', 'datetime']).mean(numeric_only=True).reset_index()

    train, client, historical_weather = filter_data(
        train, client, historical_weather, is_business, product_type, county_code
    )

    save_datasets_to_pickle([train, client, historical_weather, electricity_prices, gas_prices])
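

# Hedged example entry point: the CSV file names below are assumptions based
# on the columns this module expects; adjust them to your local data layout.
if __name__ == "__main__":
    data_dir = sys.argv[1] if len(sys.argv) > 1 else '.'
    filter_datasets(
        pd.read_csv(os.path.join(data_dir, 'train.csv')),
        pd.read_csv(os.path.join(data_dir, 'client.csv')),
        pd.read_csv(os.path.join(data_dir, 'historical_weather.csv')),
        pd.read_csv(os.path.join(data_dir, 'electricity_prices.csv')),
        pd.read_csv(os.path.join(data_dir, 'gas_prices.csv')),
    )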