import pandas as pd
from sqlalchemy import create_engine

# Download the zip file and extract it beforehand.
CSV_PATH = 'school_geolocation_measurements/measurements.csv'


def generate_df_stats(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Generate and return a new DataFrame containing the key stats of the given DataFrame.
    """
    stats = data_frame.describe(include='all')
    stats.loc['dtype'] = data_frame.dtypes
    stats.loc['size'] = len(data_frame)
    stats.loc['null count'] = data_frame.isnull().sum()
    return stats


def resample_daily_with_aggregation(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Resample the input DataFrame to daily frequency and aggregate values,
    correctly handling missing dates and school information.

    :param data_frame: The input data.
    :return: A DataFrame with resampled values.
    """
    # Ensure 'date' is a datetime and set it as the index.
    data_frame['date'] = pd.to_datetime(data_frame['date'])
    data_frame.set_index('date', inplace=True)

    resampled_data = []

    # Iterate only over unique (school ID, school name) pairs.
    for school_id, school_name in data_frame[
        ['school_id_giga', 'school_name']
    ].drop_duplicates().itertuples(index=False):
        school_df = data_frame[
            (data_frame['school_id_giga'] == school_id)
            & (data_frame['school_name'] == school_name)
        ].copy()

        # Resample to daily frequency and aggregate; days without
        # measurements appear as NaN rows, to be imputed later.
        resampled_school_df = school_df.resample('D').agg({
            'download_speed': 'mean',
            'upload_speed': 'mean',
            'latency': 'mean'
        })

        # Add back school_id_giga and school_name.
        resampled_school_df['school_id_giga'] = school_id
        resampled_school_df['school_name'] = school_name

        # Carry over the first fully populated row of static metadata.
        # Guard against the case where every row has at least one null,
        # which would otherwise make .iloc[0] raise an IndexError.
        metadata = school_df[['server_location', 'country', 'iso3_format']].dropna()
        if not metadata.empty:
            non_null_values = metadata.iloc[0]
            resampled_school_df['server_location'] = non_null_values['server_location']
            resampled_school_df['country'] = non_null_values['country']
            resampled_school_df['iso3_format'] = non_null_values['iso3_format']

        resampled_data.append(resampled_school_df)

    return pd.concat(resampled_data).reset_index()


# Load the data and drop columns that are not needed downstream.
df = pd.read_csv(CSV_PATH)
df = df.drop(
    columns=[
        'school_id_govt', 'detected_isp', 'timestamp',
        'detected_isp_asn', 'app_version', 'source'
    ]
)
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['school_id_giga', 'date'], inplace=True)

# Get the value counts for each school ID.
value_counts = df['school_id_giga'].value_counts()
print(value_counts)

# Remove rows whose school IDs occur too rarely to resample meaningfully.
min_count_threshold = 120
df_filtered = df[
    df['school_id_giga'].isin(value_counts[value_counts >= min_count_threshold].index)
]
print(f'{df_filtered.shape=}')
print(df_filtered['school_id_giga'].value_counts())

# Resample data to daily frequency.
df_daily = resample_daily_with_aggregation(df_filtered.copy())
print(df_daily.head())

# Impute missing values with the column-wide median: simple and generally
# robust to outliers.
df_daily['download_speed'] = df_daily['download_speed'].fillna(df_daily['download_speed'].median())
df_daily['upload_speed'] = df_daily['upload_speed'].fillna(df_daily['upload_speed'].median())
df_daily['latency'] = df_daily['latency'].fillna(df_daily['latency'].median())

# Export to CSV and a SQLite db. index=False keeps both exports consistent
# and avoids persisting the positional index as a column; if_exists='replace'
# lets the script be rerun without to_sql failing on the existing table.
engine = create_engine('sqlite:///resampled_daily_avg.sqlite', echo=False)
df_daily.to_sql(name='school_measurements', con=engine, index=False, if_exists='replace')
df_daily.to_csv('resampled_daily_avg.csv', index=False)
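
# Optional sanity check: summarize the exported frame with the
# generate_df_stats helper defined above (it is otherwise unused).
print(generate_df_stats(df_daily))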
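
# Alternative imputation (a sketch, not the pipeline's method): per-school
# medians may better reflect local network conditions than the global medians
# used above. To try it, replace the three fillna calls with:
#
#     for col in ['download_speed', 'upload_speed', 'latency']:
#         df_daily[col] = df_daily.groupby('school_id_giga')[col].transform(
#             lambda s: s.fillna(s.median())
#         )
#
# Note that a school whose values are all missing would stay NaN under this
# scheme, so a global-median fallback may still be needed afterwards.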