Spaces:
Sleeping
Sleeping
| """Data preprocessing utilities for consistent feature engineering.""" | |
from pathlib import Path
from typing import Optional

import pandas as pd
import yaml
# Load configuration once at module import time so every helper below shares
# the same parsed settings.
# NOTE(review): this performs file I/O during import and resolves the path
# relative to the process working directory — importing this module from a
# different CWD will raise FileNotFoundError. Confirm the deployment always
# launches from the project root.
_config_path = Path("config/model_parameters.yaml")
with open(_config_path, "r") as f:
    _config = yaml.safe_load(f)
def _get_other_category() -> str:
    """Return the canonical 'Other' bucket name declared in the config.

    Falls back to the literal string "Other" when the config omits the
    ``other_category`` key.
    """
    cardinality_cfg = _config["features"]["cardinality"]
    return cardinality_cfg.get("other_category", "Other")
def normalize_other_categories(series: pd.Series) -> pd.Series:
    """
    Collapse every 'Other ...' variant into the canonical category name.

    Survey exports contain strings such as 'Other (please specify):' or
    'Other:'; each one is rewritten to the standard 'Other' label taken
    from the configuration. Non-matching values pass through untouched.
    """
    canonical = _get_other_category()
    # \b keeps words like "Others" from matching; only "Other" followed by
    # a non-word boundary (punctuation, space, end-of-string) is rewritten.
    pattern = r"^Other\b.*$"
    return series.replace(to_replace=pattern, value=canonical, regex=True)
def reduce_cardinality(
    series: pd.Series,
    max_categories: Optional[int] = None,
    min_frequency: Optional[int] = None,
) -> pd.Series:
    """
    Reduce cardinality by grouping rare categories into 'Other'.

    Args:
        series: Pandas Series with categorical (string) values.
        max_categories: Maximum number of categories to keep
            (default: from config).
        min_frequency: Minimum occurrences for a category to be kept
            (default: from config).

    Returns:
        Series with rare categories (and NaN) replaced by the standard
        'Other' category name.
    """
    other_name = _get_other_category()

    # Use config defaults if not provided.
    cardinality_cfg = _config["features"]["cardinality"]
    if max_categories is None:
        max_categories = cardinality_cfg["max_categories"]
    if min_frequency is None:
        min_frequency = cardinality_cfg["min_frequency"]

    # Fold "Other (please specify):"-style variants into 'Other' first so
    # their counts are pooled before frequencies are computed.
    series = normalize_other_categories(series)

    value_counts = series.value_counts()

    # Keep a category only if it is BOTH in the top `max_categories` by
    # frequency AND occurs at least `min_frequency` times. A set makes the
    # membership test O(1) instead of scanning a list per row.
    top_categories = value_counts.head(max_categories)
    kept_categories = set(top_categories[top_categories >= min_frequency].index)

    # Vectorized replacement: anything not kept — including NaN, which
    # isin() reports as False — becomes the 'Other' category, matching the
    # previous per-row apply() behavior without the Python-level lambda.
    return series.where(series.isin(kept_categories), other_name)
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply consistent feature transformations for both training and inference.

    This function ensures that the same preprocessing steps are applied
    during training and inference, preventing data leakage and
    inconsistencies.

    Args:
        df: DataFrame with columns: Country, YearsCode, WorkExp, EdLevel,
            DevType, Industry, Age, ICorPM, OrgSize, Employment.
            NOTE: During training, cardinality reduction should be applied
            to df BEFORE calling this function. During inference,
            valid_categories.yaml ensures only valid (already-reduced)
            categories are used.

    Returns:
        DataFrame with one-hot encoded features ready for model input.

    Note:
        - Fills missing values with defaults (0 for numeric, "Unknown" for
          categorical)
        - Normalizes Unicode apostrophes to regular apostrophes
        - Normalizes "Other ..." category variants to the standard 'Other'
        - One-hot encodes categoricals; output columns look like YearsCode,
          WorkExp, Country_X, EdLevel_Y, DevType_Z, Industry_W, Age_V,
          ICorPM_U
        - Does NOT apply cardinality reduction (must be done before calling)
    """
    # Work on a copy so the caller's frame is never mutated.
    df_processed = df.copy()

    categorical_cols = [
        "Country",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]

    # Per-column text cleanup (each step is independent per column, so one
    # pass suffices):
    #   1. replace Unicode right single quote (\u2019) with an ASCII
    #      apostrophe for consistency between data sources
    #   2. collapse "Other (please specify):"-style variants into 'Other'
    for col in categorical_cols:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].str.replace(
                "\u2019", "'", regex=False
            )
            df_processed[col] = normalize_other_categories(df_processed[col])

    # Handle legacy column name (YearsCodePro -> YearsCode). Plain
    # assignment instead of inplace=True — df_processed is already our copy.
    if (
        "YearsCodePro" in df_processed.columns
        and "YearsCode" not in df_processed.columns
    ):
        df_processed = df_processed.rename(columns={"YearsCodePro": "YearsCode"})

    # Fill missing values with defaults: 0 for numeric, "Unknown" for
    # categorical. A per-column loop (rather than DataFrame.fillna(dict))
    # keeps the original behavior of raising KeyError on a missing column.
    fill_defaults = {"YearsCode": 0, "WorkExp": 0}
    fill_defaults.update({col: "Unknown" for col in categorical_cols})
    for col, default in fill_defaults.items():
        df_processed[col] = df_processed[col].fillna(default)

    # NOTE: Cardinality reduction is NOT applied here. It must happen during
    # training BEFORE calling this function; during inference,
    # valid_categories.yaml ensures only valid values are used.

    # Select only the features the model consumes, in a fixed order.
    feature_cols = [
        "Country",
        "YearsCode",
        "WorkExp",
        "EdLevel",
        "DevType",
        "Industry",
        "Age",
        "ICorPM",
        "OrgSize",
        "Employment",
    ]
    df_features = df_processed[feature_cols]

    # One-hot encode the categoricals. For a single row (inference) we must
    # use drop_first=False so indicator columns are actually created; the
    # reindex in infer.py then aligns them with the training columns. For
    # training batches we honor the configured drop_first setting (avoids
    # multicollinearity when enabled).
    is_inference = len(df_features) == 1
    drop_first = (
        False if is_inference else _config["features"]["encoding"]["drop_first"]
    )
    return pd.get_dummies(df_features, drop_first=drop_first)