Spaces:
Sleeping
Sleeping
| """ | |
| Data ingestion and cleaning for the Major Power Outage dataset. | |
| Assumptions | |
| ----------- | |
| * Source: Kaggle "Major Power Outage Risks in the U.S." (Purdue / DOE). | |
| * The CSV is pre-exported; if using the original .xlsx you may need to | |
| adjust `SKIP_ROWS_IN_RAW` in config.py. | |
| * Rows with missing CUSTOMERS.AFFECTED *and* missing OUTAGE.DURATION are | |
| dropped because the target cannot be computed. | |
| """ | |
from __future__ import annotations

import logging
from pathlib import Path

import numpy as np
import pandas as pd

from src.config import (
    CUSTOMERS_AFFECTED_COL,
    IMPACT_QUANTILE,
    OUTAGE_DURATION_COL,
    RAW_DATA_PATH,
    SKIP_ROWS_IN_RAW,
    TARGET_COL,
)

logger = logging.getLogger(__name__)
def load_raw(path: Path | str | None = None, skip_rows: int = SKIP_ROWS_IN_RAW) -> pd.DataFrame:
    """Read the outage CSV/Excel export into a DataFrame, handling common formatting issues.

    Parameters
    ----------
    path:
        Location of the dataset file. ``None`` falls back to ``RAW_DATA_PATH``.
    skip_rows:
        Number of leading metadata rows to skip (the original Excel export
        of this dataset carries a preamble before the header row).

    Returns
    -------
    pd.DataFrame
        Raw rows with whitespace-stripped column names and fully empty
        rows/columns removed.

    Raises
    ------
    FileNotFoundError
        If no file exists at *path*.
    """
    # `is not None` (not truthiness): an explicitly passed empty-string path
    # should fail loudly below, not silently fall back to the default.
    path = Path(path) if path is not None else RAW_DATA_PATH
    if not path.exists():
        raise FileNotFoundError(
            f"Dataset not found at {path}. Download from Kaggle and place it there."
        )
    if path.suffix.lower() in (".xlsx", ".xls"):
        df = pd.read_excel(path, skiprows=skip_rows)
    else:
        # Try reading with skiprows first; fall back to 0 if parsing fails.
        # Narrowed from `except Exception` so genuine I/O / permission errors
        # still surface instead of triggering a second doomed read.
        try:
            df = pd.read_csv(path, skiprows=skip_rows)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            logger.warning(
                "Reading %s with skiprows=%d failed; retrying without skiprows",
                path,
                skip_rows,
            )
            df = pd.read_csv(path)
    # Strip whitespace from column names
    df.columns = df.columns.str.strip()
    # Drop fully empty rows / columns (common artifact of Excel exports)
    df.dropna(how="all", axis=0, inplace=True)
    df.dropna(how="all", axis=1, inplace=True)
    logger.info("Loaded %d rows, %d columns from %s", len(df), len(df.columns), path)
    return df
def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Basic type coercion and null handling.

    Steps
    -----
    1. Drop the units row that some exports place directly under the header.
    2. Coerce numeric columns read as ``object`` (stripping thousands commas).
    3. Recast MONTH to string (keeps scikit-learn imputers from choking on
       mixed types) while preserving missingness as ``NaN``.
    4. Drop rows where neither target dependency is present, since the
       binary target cannot be computed for them.
    """
    df = df.copy()
    # The first row often contains units/labels (e.g. "Year", "numeric").
    # Guarded column/length checks: the original crashed with KeyError on a
    # missing column and IndexError on an empty frame.
    if len(df) and (
        ("YEAR" in df.columns and df["YEAR"].iloc[0] == "Year")
        or ("ANOMALY.LEVEL" in df.columns and df["ANOMALY.LEVEL"].iloc[0] == "numeric")
    ):
        df = df.iloc[1:].reset_index(drop=True)
    # Local import mirrors the original; presumably avoids a circular import
    # at module load — TODO confirm before hoisting to the top of the file.
    from src.config import NUMERIC_FEATURES

    # Include extra columns used for feature engineering that might not be
    # in NUMERIC_FEATURES; set() de-duplicates overlaps.
    num_cols = list(set(NUMERIC_FEATURES + [
        CUSTOMERS_AFFECTED_COL, OUTAGE_DURATION_COL, "DEMAND.LOSS.MW",
        "TOTAL.CUSTOMERS", "RES.SALES", "TOTAL.SALES", "RES.PRICE", "IND.PRICE",
        "POPDEN_URBAN", "POPDEN_RURAL"
    ]))
    for col in num_cols:
        if col in df.columns:
            # Strip thousands separators ("1,234") before numeric coercion.
            if df[col].dtype == object and df[col].astype(str).str.contains(',').any():
                df[col] = df[col].astype(str).str.replace(',', '')
            df[col] = pd.to_numeric(df[col], errors="coerce")
    # Parse MONTH as string to avoid scikit-learn imputer mixed-type errors;
    # the -1 round-trip keeps missing values as NaN rather than "nan" strings.
    if "MONTH" in df.columns:
        df["MONTH"] = pd.to_numeric(df["MONTH"], errors="coerce").fillna(-1).astype(int).astype(str)
        df.loc[df["MONTH"] == "-1", "MONTH"] = np.nan
    # Drop rows where we cannot define the target at all
    target_deps = [c for c in [CUSTOMERS_AFFECTED_COL, OUTAGE_DURATION_COL] if c in df.columns]
    df.dropna(subset=target_deps, how="all", inplace=True)
    df.reset_index(drop=True, inplace=True)
    logger.info("After cleaning: %d rows remain", len(df))
    return df
def build_target(df: pd.DataFrame, quantile: float = IMPACT_QUANTILE) -> pd.DataFrame:
    """
    Define binary target: **high_impact**.

    An outage qualifies as high-impact when:

        customers_affected >= Q(quantile) AND duration >= Q(quantile)

    This deliberately selects the upper-right quadrant of severity, which
    is the segment an operations center would triage first.

    When either column is missing for a row, we fall back to the single
    available column so we don't lose too many samples.
    """
    df = df.copy()
    cust_thresh = df[CUSTOMERS_AFFECTED_COL].quantile(quantile)
    dur_thresh = df[OUTAGE_DURATION_COL].quantile(quantile)
    has_cust = df[CUSTOMERS_AFFECTED_COL].notna()
    has_dur = df[OUTAGE_DURATION_COL].notna()
    # NaN comparisons evaluate to False, so high_* is already False wherever
    # the underlying value is missing.
    high_cust = df[CUSTOMERS_AFFECTED_COL] >= cust_thresh
    high_dur = df[OUTAGE_DURATION_COL] >= dur_thresh
    both = has_cust & has_dur
    # Both present -> require both; exactly one present -> use it (the OR
    # reduces to the available column because the missing side is False);
    # neither present -> 0 (such rows are dropped in clean(), but stay safe).
    # Replaces the original's three masked .loc assignments, including the
    # redundant `& both` term — provably identical output.
    df[TARGET_COL] = np.where(both, high_cust & high_dur, high_cust | high_dur).astype(int)
    pos_rate = df[TARGET_COL].mean()
    # Log message de-mojibaked: the original contained a garbled "β" where
    # an arrow was intended.
    logger.info(
        "Target built -> positive rate: %.2f%% (%d / %d) | thresholds: customers>=%.0f, duration>=%.0f min",
        pos_rate * 100,
        df[TARGET_COL].sum(),
        len(df),
        cust_thresh,
        dur_thresh,
    )
    return df
def get_dataset(path: Path | str | None = None) -> pd.DataFrame:
    """End-to-end pipeline: load the raw file, clean it, and attach the target."""
    return build_target(clean(load_raw(path)))