File size: 5,276 Bytes
115745a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
from geopy.geocoders import Nominatim
import sys
import os


def initialize_geolocator(user_agent="county_locator"):
    """Create and return a Nominatim geocoder.

    Args:
        user_agent: Identifier sent to the Nominatim service with each request.

    Returns:
        A configured ``Nominatim`` geocoder instance.
    """
    geolocator = Nominatim(user_agent=user_agent)
    return geolocator


def get_county_from_coordinates(latitude, longitude, geolocator):
    """Reverse-geocode a coordinate pair to a county name.

    Args:
        latitude: Latitude of the point.
        longitude: Longitude of the point.
        geolocator: Geocoder exposing ``reverse((lat, lon), language=...)``.

    Returns:
        The county name reported by the geocoder, or ``"Unknown"`` when the
        lookup yields no result or the result carries no county field.
    """
    result = geolocator.reverse((latitude, longitude), language="en")
    if not result:
        return "Unknown"
    address = result.raw.get("address", {})
    return address.get("county", "Unknown")


def add_county_column(df):
    """Replace coordinate columns with a reverse-geocoded county column.

    Each unique (longitude, latitude) pair is geocoded only once; the
    resulting county name is then merged back onto every matching row and
    the coordinate columns are dropped.

    Args:
        df: DataFrame with ``longitude`` and ``latitude`` columns.

    Returns:
        DataFrame where the coordinate columns are replaced by ``county``.
    """
    geolocator = initialize_geolocator()
    unique_coords = df[['longitude', 'latitude']].drop_duplicates()
    unique_coords['county'] = [
        get_county_from_coordinates(lat, lon, geolocator)
        for lon, lat in zip(unique_coords['longitude'], unique_coords['latitude'])
    ]
    merged = df.merge(
        unique_coords[['latitude', 'longitude', 'county']],
        on=['latitude', 'longitude'],
    )
    return merged.drop(['longitude', 'latitude'], axis=1)


def filter_estonian_counties(df):
    """Filter rows to Estonian counties and replace names with integer codes.

    Rows whose ``county`` is not one of the 15 Estonian county names below
    are dropped; the remaining names are mapped to fixed integer codes
    (code 12 is intentionally unused in this mapping).

    Args:
        df: DataFrame with a ``county`` column holding county names.

    Returns:
        A copy of the matching rows with ``county`` encoded as integers.
    """
    # Single source of truth: the dict keys double as the allow-list,
    # removing the duplicated county-name list the original carried.
    county_to_int = {
        'Saare County': 10, 'Võru County': 15, 'Pärnu County': 7, 'Valga County': 13, 'Viljandi County': 14,
        'Tartu County': 11, 'Põlva County': 8, 'Jõgeva County': 4, 'Hiiu County': 1, 'Lääne County': 6,
        'Rapla County': 9, 'Järva County': 3, 'Harju County': 0, 'Lääne-Viru County': 5, 'Ida-Viru County': 2
    }
    # Take an explicit copy: assigning through `.loc` on a boolean-mask slice
    # of the caller's frame triggers pandas' SettingWithCopyWarning and is
    # fragile under copy-on-write semantics.
    df = df[df['county'].isin(county_to_int)].copy()
    df['county'] = df['county'].map(county_to_int)
    return df


def filter_data(train, client, weather, is_business, product_type, county_code):
    """Restrict the three frames to one (is_business, product_type, county) slice.

    Args:
        train: Frame with ``is_business``, ``product_type``, ``county`` and
            ``is_consumption`` columns; only production rows
            (``is_consumption == 0``) are kept.
        client: Frame with ``is_business``, ``product_type`` and ``county``.
        weather: Frame with a ``county`` column.
        is_business: Business-flag value to keep.
        product_type: Product-type value to keep.
        county_code: Integer county code to keep.

    Returns:
        Tuple ``(train, client, weather)``, each filtered and with the
        filter columns removed.
    """
    train_mask = (
        (train['is_business'] == is_business)
        & (train['product_type'] == product_type)
        & (train['county'] == county_code)
    )
    # Keep production rows only, then discard every column used for filtering.
    train = train[train_mask]
    train = train[train['is_consumption'] == 0]
    train = train.drop(['is_business', 'product_type', 'county', 'is_consumption'], axis=1)

    client_mask = (
        (client['is_business'] == is_business)
        & (client['product_type'] == product_type)
        & (client['county'] == county_code)
    )
    client = client[client_mask].drop(['is_business', 'product_type', 'county'], axis=1)

    weather = weather[weather['county'] == county_code].drop(['county'], axis=1)

    return train, client, weather

def save_datasets_to_pickle(datasets, paths=None):
    """Pickle each dataset to its corresponding path.

    Args:
        datasets: Sequence of DataFrames to persist.
        paths: Optional sequence of target file paths, one per dataset.
            Defaults to the five standard files under ``process_files/``
            next to this module.

    Raises:
        ValueError: If ``datasets`` and ``paths`` have different lengths
            (the original silently dropped the extras via ``zip``).
    """
    if paths is None:
        # Resolve defaults relative to this file, not the working directory.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        names = ['generation.pkl', 'client.pkl', 'historical_weather.pkl',
                 'electricity_prices.pkl', 'gas_prices.pkl']
        paths = [os.path.join(base_dir, 'process_files', name) for name in names]

    if len(datasets) != len(paths):
        raise ValueError(f"got {len(datasets)} datasets but {len(paths)} paths")

    for dataset, path in zip(datasets, paths):
        # Create the target directory on demand so to_pickle cannot fail
        # with FileNotFoundError on a fresh checkout.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        dataset.to_pickle(path)



def filter_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Clean, geocode, filter and persist the five raw datasets.

    Drops bookkeeping columns in place, parses date columns, attaches an
    integer county code to the weather data, restricts everything to one
    (is_business, product_type, county) segment and writes the results out
    as pickle files.
    """
    # Segment selected for modelling.
    is_business, product_type, county_code = 1, 3, 0

    # (frame, columns to drop, columns to parse as datetimes)
    cleanup_plan = (
        (train, ['data_block_id', 'row_id', 'prediction_unit_id'], ['datetime']),
        (client, ['data_block_id'], ['date']),
        (historical_weather, ['data_block_id'], ['datetime']),
        (electricity_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
        (gas_prices, ['data_block_id', 'origin_date'], ['forecast_date']),
    )
    for frame, obsolete_cols, datetime_cols in cleanup_plan:
        frame.drop(obsolete_cols, axis=1, inplace=True)
        for column in datetime_cols:
            frame[column] = pd.to_datetime(frame[column])

    # Geocode weather locations to counties, keep Estonian ones, then
    # average all locations of a county per timestamp.
    historical_weather = add_county_column(historical_weather)
    historical_weather = filter_estonian_counties(historical_weather)
    historical_weather = (
        historical_weather.groupby(['county', 'datetime']).agg('mean').reset_index()
    )

    train, client, historical_weather = filter_data(
        train, client, historical_weather, is_business, product_type, county_code
    )

    save_datasets_to_pickle(
        [train, client, historical_weather, electricity_prices, gas_prices]
    )