# PI2 / filterdf.py
# Provenance: uploaded to a Hugging Face repo by josegoji ("Upload 7 files", commit 115745a, verified).
import pandas as pd
from geopy.geocoders import Nominatim
import sys
import os
def initialize_geolocator(user_agent="county_locator"):
    """Create a Nominatim reverse-geocoding client.

    Parameters
    ----------
    user_agent : str, optional
        Identifier sent with each Nominatim request (required by the
        Nominatim usage policy).

    Returns
    -------
    Nominatim
        A ready-to-use geolocator instance.
    """
    geolocator = Nominatim(user_agent=user_agent)
    return geolocator
def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode one coordinate pair to a county name.

    Parameters
    ----------
    latitude, longitude : float
        Point to look up.
    geolocator : object
        Geocoder exposing ``reverse((lat, lon), language=...)`` that returns
        an object with a ``raw`` dict, or a falsy value on failure.

    Returns
    -------
    str
        The county name, or ``"Unknown"`` when the lookup fails or the
        response carries no county field.
    """
    result = geolocator.reverse((latitude, longitude), language="en")
    if not result:
        return "Unknown"
    address = result.raw.get("address", {})
    return address.get("county", "Unknown")
def add_county_column(df):
    """Attach a 'county' column resolved from coordinates, dropping the raw ones.

    Reverse-geocodes each *unique* (longitude, latitude) pair once, then
    broadcasts the result back onto every matching row via an inner merge.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'longitude' and 'latitude' columns.

    Returns
    -------
    pd.DataFrame
        Same rows with 'county' added and 'longitude'/'latitude' removed.
    """
    geolocator = initialize_geolocator()
    # Deduplicate first so each distinct point costs only one network lookup.
    unique_coords = df[['longitude', 'latitude']].drop_duplicates()
    counties = [
        get_county_from_coordinates(point['latitude'], point['longitude'], geolocator)
        for _, point in unique_coords.iterrows()
    ]
    unique_coords['county'] = counties
    merged = df.merge(
        unique_coords[['latitude', 'longitude', 'county']],
        on=['latitude', 'longitude'],
    )
    return merged.drop(['longitude', 'latitude'], axis=1)
def filter_estonian_counties(df):
    """Keep only rows from the 15 Estonian counties and integer-encode them.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'county' column of county-name strings (as produced
        by ``add_county_column``).

    Returns
    -------
    pd.DataFrame
        A new frame with non-Estonian rows removed and 'county' replaced by
        its integer code.

    Notes
    -----
    The codes look alphabetical (Harju=0 ... Võru=15) but skip 12 — these
    exact values are kept because downstream filters (e.g. ``county_code``
    in ``filter_data``) depend on them.
    """
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # Membership test reuses the mapping keys, so the allowed-county list and
    # the encoding can never drift apart (the original kept two copies).
    # .copy() makes the assignment below write to an independent frame rather
    # than a view of the caller's df (avoids SettingWithCopyWarning).
    df = df[df['county'].isin(county_to_int.keys())].copy()
    df['county'] = df['county'].map(county_to_int)
    return df
def filter_data(train, client, weather, is_business, product_type, county_code):
    """Restrict the three frames to one customer segment.

    Parameters
    ----------
    train, client : pd.DataFrame
        Must carry 'is_business', 'product_type' and 'county' columns;
        *train* must additionally carry 'is_consumption'.
    weather : pd.DataFrame
        Must carry a 'county' column.
    is_business, product_type, county_code : int
        Segment selectors.

    Returns
    -------
    tuple of pd.DataFrame
        (train, client, weather) with the filter columns removed; *train*
        keeps only production rows (is_consumption == 0).
    """
    def select_segment(frame):
        # Rows matching the requested segment, with the key columns dropped.
        mask = (
            (frame['is_business'] == is_business)
            & (frame['product_type'] == product_type)
            & (frame['county'] == county_code)
        )
        return frame[mask].drop(['is_business', 'product_type', 'county'], axis=1)

    train = select_segment(train)
    # Keep only generation (production) records.
    train = train[train['is_consumption'] == 0].drop(['is_consumption'], axis=1)
    client = select_segment(client)
    weather = weather[weather['county'] == county_code].drop(['county'], axis=1)
    return train, client, weather
def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset to its corresponding path.

    Parameters
    ----------
    datasets : list
        Objects exposing ``.to_pickle(path)`` (e.g. pd.DataFrame), matched
        positionally with *paths*.
    paths : list of str, optional
        Destination file paths. Defaults to the five standard files under
        ``<this file's dir>/process_files``.

    Notes
    -----
    Extra items beyond the shorter of the two lists are silently ignored
    (``zip`` semantics, as in the original implementation).
    """
    if paths is None:
        # Resolve relative to this source file so saving works from any CWD.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        filenames = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                     'electricity_prices.pkl', 'gas_prices.pkl']
        paths = [os.path.join(base_dir, 'process_files', name) for name in filenames]
    for dataset, path in zip(datasets, paths):
        # to_pickle does not create missing directories — do it here so the
        # first run doesn't fail with FileNotFoundError on process_files/.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        dataset.to_pickle(path)
def filter_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Clean, geocode, filter and persist the five raw dataframes.

    Steps: drop bookkeeping columns and parse date columns in place,
    reverse-geocode weather coordinates to Estonian counties, average the
    weather per (county, datetime), restrict train/client/weather to one
    customer segment, and pickle everything via ``save_datasets_to_pickle``.

    Note: the column drops use ``inplace=True``, so the caller's frames are
    mutated.
    """
    # Target segment: business clients (1), product type 3, county code 0.
    is_business, product_type, county_code = 1, 3, 0

    # (frame, columns to drop, columns to parse as datetimes)
    cleanup_plan = (
        (train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']),
        (client, ['data_block_id'], ['date']),
        (historical_weather, ['data_block_id'], ['datetime']),
        (electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
        (gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
    )
    for frame, obsolete_cols, date_cols in cleanup_plan:
        frame.drop(obsolete_cols, axis=1, inplace=True)
        for column in date_cols:
            frame[column] = pd.to_datetime(frame[column])

    # Map weather coordinates to counties, keep Estonian ones only.
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)
    # Collapse multiple stations per county into one mean reading per timestamp.
    historical_weather = historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()

    # Restrict to the chosen segment and persist everything.
    train, client, historical_weather = filter_data(
        train, client, historical_weather, is_business, product_type, county_code
    )
    save_datasets_to_pickle([train, client, historical_weather, electricity_prices, gas_prices])