| """Optuna hyperparameter optimization for the salary prediction model.""" |
|
|
| import argparse |
| from pathlib import Path |
|
|
| import numpy as np |
| import optuna |
| import pandas as pd |
| import yaml |
| from sklearn.model_selection import KFold |
| from xgboost import XGBRegressor |
|
|
| from src.preprocessing import prepare_features |
| from src.train import ( |
| apply_cardinality_reduction, |
| drop_other_rows, |
| filter_salaries, |
| ) |
|
|
|
|
def sample_params(trial: "optuna.Trial", search_space: dict) -> dict:
    """Sample hyperparameters from the search space using an Optuna trial.

    Each entry of ``search_space`` maps a parameter name to a spec dict whose
    ``type`` key selects the sampler:

    * ``"int"``: requires ``low`` and ``high``; optional ``step`` and ``log``.
    * ``"float"``: requires ``low`` and ``high``; optional ``step`` and ``log``
      (``log`` defaults to ``False``).
    * ``"categorical"``: requires ``choices``.

    Args:
        trial: Optuna trial object.
        search_space: Dict mapping parameter names to their search config.

    Returns:
        Dict of sampled hyperparameter values, in search-space order.

    Raises:
        ValueError: If a spec declares a ``type`` that is not supported.
        KeyError: If a spec is missing a required key for its type.
    """
    params = {}
    for name, spec in search_space.items():
        param_type = spec["type"]
        if param_type == "int":
            # Forward optional knobs only when configured so that a plain
            # low/high spec results in exactly suggest_int(name, low, high).
            extras = {key: spec[key] for key in ("step", "log") if key in spec}
            params[name] = trial.suggest_int(
                name, spec["low"], spec["high"], **extras
            )
        elif param_type == "float":
            extras = {"log": spec.get("log", False)}
            if "step" in spec:
                extras["step"] = spec["step"]
            params[name] = trial.suggest_float(
                name, spec["low"], spec["high"], **extras
            )
        elif param_type == "categorical":
            params[name] = trial.suggest_categorical(name, spec["choices"])
        else:
            # A silently skipped parameter would fall back to the model's
            # default and never be tuned, so fail loudly instead.
            raise ValueError(
                f"Unsupported search-space type {param_type!r} for parameter "
                f"{name!r}; expected 'int', 'float', or 'categorical'"
            )
    return params
|
|
|
|
def build_objective(X: pd.DataFrame, y: pd.Series, optuna_config: dict) -> callable:
    """Create the function Optuna minimises/maximises for XGBoost tuning.

    The returned objective samples hyperparameters for a trial, trains one
    ``XGBRegressor`` per K-fold split and reports the average MAPE across
    the held-out folds.

    Args:
        X: Feature matrix.
        y: Target vector.
        optuna_config: Optuna config dict; reads ``search_space``, ``fixed``
            and ``study.cv_splits``.

    Returns:
        Objective function that takes a trial and returns mean MAPE (percent).
    """
    space = optuna_config["search_space"]
    fixed_params = optuna_config["fixed"]
    n_folds = optuna_config["study"]["cv_splits"]
    seed = fixed_params.get("random_state", 42)

    def objective(trial: optuna.Trial) -> float:
        # Fixed settings are merged last, so they take precedence over any
        # sampled value that shares the same name.
        model_kwargs = {**sample_params(trial, space), **fixed_params}

        splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
        fold_mapes = []

        for fit_rows, holdout_rows in splitter.split(X):
            X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
            X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

            regressor = XGBRegressor(**model_kwargs)
            regressor.fit(
                X_fit,
                y_fit,
                eval_set=[(X_holdout, y_holdout)],
                verbose=False,
            )

            # Mean absolute percentage error on the held-out fold.
            relative_error = (y_holdout - regressor.predict(X_holdout)) / y_holdout
            fold_mapes.append(np.mean(np.abs(relative_error)) * 100)

        return np.mean(fold_mapes)

    return objective
|
|
|
|
def save_best_params(best_params: dict, config_path: Path) -> None:
    """Save the best hyperparameters to model_parameters.yaml.

    Updates the ``model:`` section with tuned values, preserving all other
    config keys and their order. The file is rewritten through PyYAML, so
    comments and custom formatting in the existing file are not retained.

    Args:
        best_params: Dict of best hyperparameter values from Optuna.
        config_path: Path to model_parameters.yaml.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        # An empty YAML document parses to None; treat it as an empty config.
        config = yaml.safe_load(f) or {}

    model_section = config.get("model")
    if model_section is None:
        # Missing or null `model:` section: create it rather than crashing
        # after an entire tuning run has already completed.
        model_section = {}
        config["model"] = model_section
    model_section.update(best_params)

    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
|
|
|
|
def main() -> None:
    """Run Optuna hyperparameter optimization.

    Steps:
        1. Parse ``--n-trials`` (optional override of the configured count).
        2. Load ``config/optuna_config.yaml`` and
           ``config/model_parameters.yaml``.
        3. Read the survey CSV and pass it through ``filter_salaries``,
           ``apply_cardinality_reduction`` and ``drop_other_rows`` from
           ``src.train``, then build features with ``prepare_features``.
        4. Run the Optuna study and print the best trial.
        5. Write the best hyperparameters back to the model config file.

    Returns early (after printing instructions) if the data file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Optuna hyperparameter optimization for salary prediction"
    )
    parser.add_argument(
        "--n-trials",
        type=int,
        default=None,
        help="Number of optimization trials (overrides config default)",
    )
    args = parser.parse_args()
    # Reject nonsensical overrides up front instead of silently falling back
    # to the config value (0) or failing after data loading (negative).
    if args.n_trials is not None and args.n_trials < 1:
        parser.error("--n-trials must be a positive integer")

    # Load configuration.
    optuna_config_path = Path("config/optuna_config.yaml")
    with open(optuna_config_path, "r", encoding="utf-8") as f:
        optuna_config = yaml.safe_load(f)

    model_config_path = Path("config/model_parameters.yaml")
    with open(model_config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # An explicit CLI value always wins; `is not None` (rather than `or`)
    # keeps a falsy value from being mistaken for "not provided".
    if args.n_trials is not None:
        n_trials = args.n_trials
    else:
        n_trials = optuna_config["study"]["n_trials"]

    # Load data.
    print("Loading data...")
    data_path = Path("data/survey_results_public.csv")
    if not data_path.exists():
        print(f"Error: Data file not found at {data_path}")
        print(
            "Please download the Stack Overflow Developer Survey CSV "
            "and place it in the data/ directory."
        )
        return

    df = pd.read_csv(
        data_path,
        usecols=[
            "Country",
            "YearsCode",
            "WorkExp",
            "EdLevel",
            "DevType",
            "Industry",
            "Age",
            "ICorPM",
            "OrgSize",
            "Employment",
            "Currency",
            "CompTotal",
            "ConvertedCompYearly",
        ],
    )
    print(f"Loaded {len(df):,} rows")

    # Apply the row filtering / category reduction helpers from src.train.
    df = filter_salaries(df, config)
    df = apply_cardinality_reduction(df)
    df = drop_other_rows(df, config)

    # Build the feature matrix and the scaled target.
    main_label = "ConvertedCompYearly"
    X = prepare_features(df)
    y = df[main_label] * config["data"]["salary_scale"]

    print(f"Feature matrix shape: {X.shape}")
    print(f"\nStarting Optuna optimization with {n_trials} trials...")

    # Run the study.
    objective = build_objective(X, y, optuna_config)
    study = optuna.create_study(
        direction=optuna_config["study"]["direction"],
    )
    study.optimize(objective, n_trials=n_trials)

    # Report results.
    print(f"\nBest trial: #{study.best_trial.number}")
    print(f"Best MAPE: {study.best_value:.2f}%")
    print("Best hyperparameters:")
    for name, value in study.best_params.items():
        print(f"  {name}: {value}")

    # Persist the tuned values into the model config.
    save_best_params(study.best_params, model_config_path)
    print(f"\nBest parameters saved to {model_config_path}")
|
|
|
|
# Run the optimization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|