Spaces:

dima806
/

developer_salary_prediction

Running

App Files Files Community

developer_salary_prediction / src /tune.py

dima806

Upload 39 files

eeeaee6 verified 2 months ago

raw

history blame contribute delete

5.76 kB

	"""Optuna hyperparameter optimization for the salary prediction model."""

	import argparse
	from pathlib import Path

	import numpy as np
	import optuna
	import pandas as pd
	import yaml
	from sklearn.model_selection import KFold
	from xgboost import XGBRegressor

	from src.preprocessing import prepare_features
	from src.train import (
	apply_cardinality_reduction,
	drop_other_rows,
	filter_salaries,
	)


	def sample_params(trial: optuna.Trial, search_space: dict) -> dict:
	    """Sample hyperparameters from the search space using an Optuna trial.

	    Supported parameter types (the ``type`` key of each spec):

	    * ``"int"`` -- sampled with ``trial.suggest_int(name, low, high)``.
	    * ``"float"`` -- sampled with ``trial.suggest_float(name, low, high,
	      log=...)``; the optional ``log`` key defaults to ``False``.
	    * ``"categorical"`` -- sampled with
	      ``trial.suggest_categorical(name, choices)``.

	    Args:
	        trial: Optuna trial object.
	        search_space: Dict mapping parameter names to their search config
	            (``type`` plus ``low``/``high`` and optional ``log``, or
	            ``type`` plus ``choices`` for categorical parameters).

	    Returns:
	        Dict of sampled hyperparameter values, one entry per search-space
	        parameter, in search-space order.

	    Raises:
	        ValueError: If a spec declares a ``type`` that is not supported.
	        KeyError: If a spec lacks a key required by its type.
	    """
	    params = {}
	    for name, spec in search_space.items():
	        param_type = spec["type"]
	        if param_type == "int":
	            params[name] = trial.suggest_int(name, spec["low"], spec["high"])
	        elif param_type == "float":
	            log = spec.get("log", False)
	            params[name] = trial.suggest_float(
	                name, spec["low"], spec["high"], log=log
	            )
	        elif param_type == "categorical":
	            params[name] = trial.suggest_categorical(name, spec["choices"])
	        else:
	            # Fail loudly: silently skipping the entry would leave the
	            # parameter untuned without any hint that the config is wrong.
	            raise ValueError(
	                f"Unsupported search-space type {param_type!r} for "
	                f"parameter {name!r}; use 'int', 'float' or 'categorical'."
	            )
	    return params


	def build_objective(X: pd.DataFrame, y: pd.Series, optuna_config: dict) -> callable:
	    """Create the cross-validated XGBoost objective that Optuna optimizes.

	    Args:
	        X: Feature matrix.
	        y: Target vector; rows are selected by the same positions as ``X``.
	        optuna_config: Optuna config dict providing ``search_space``,
	            ``fixed`` and ``study`` (the latter with ``cv_splits``).

	    Returns:
	        Objective function that takes a trial and returns the MAPE
	        (in percent) averaged over the cross-validation folds.
	    """
	    space = optuna_config["search_space"]
	    fixed_kwargs = optuna_config["fixed"]
	    n_folds = optuna_config["study"]["cv_splits"]
	    seed = fixed_kwargs.get("random_state", 42)

	    def _fold_mape(model_kwargs, fit_rows, eval_rows):
	        """Fit a model on one fold and return its held-out MAPE in percent."""
	        features_fit, features_eval = X.iloc[fit_rows], X.iloc[eval_rows]
	        target_fit, target_eval = y.iloc[fit_rows], y.iloc[eval_rows]

	        regressor = XGBRegressor(**model_kwargs)
	        regressor.fit(
	            features_fit,
	            target_fit,
	            eval_set=[(features_eval, target_eval)],
	            verbose=False,
	        )

	        predicted = regressor.predict(features_eval)
	        relative_error = (target_eval - predicted) / target_eval
	        return np.mean(np.abs(relative_error)) * 100

	    def objective(trial: optuna.Trial) -> float:
	        """Score one sampled hyperparameter set by its mean fold MAPE."""
	        model_kwargs = sample_params(trial, space)
	        # Fixed settings are applied last, so they win over sampled values.
	        model_kwargs.update(fixed_kwargs)

	        splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
	        fold_scores = [
	            _fold_mape(model_kwargs, fit_rows, eval_rows)
	            for fit_rows, eval_rows in splitter.split(X)
	        ]
	        return np.mean(fold_scores)

	    return objective


	def save_best_params(best_params: dict, config_path: Path) -> None:
	    """Save the best hyperparameters to model_parameters.yaml.

	    Updates the ``model:`` section with tuned values, preserving all other
	    config keys and their order. The file is rewritten from the parsed
	    data, so only what ``yaml.safe_load`` returns is carried over.

	    An empty config file, or one without a (non-null) ``model:`` section,
	    is handled by creating the section instead of failing.

	    Args:
	        best_params: Dict of best hyperparameter values from Optuna.
	        config_path: Path to model_parameters.yaml.
	    """
	    # Explicit UTF-8 keeps reading and writing independent of the
	    # platform's default locale encoding.
	    with open(config_path, "r", encoding="utf-8") as f:
	        # safe_load returns None for an empty document.
	        config = yaml.safe_load(f) or {}

	    model_section = config.get("model")
	    if model_section is None:
	        model_section = config["model"] = {}
	    model_section.update(best_params)

	    # Serialize BEFORE reopening the file: mode "w" truncates it at once,
	    # so a failure inside yaml.dump must not leave an empty config behind.
	    serialized = yaml.dump(config, default_flow_style=False, sort_keys=False)
	    with open(config_path, "w", encoding="utf-8") as f:
	        f.write(serialized)


	def main():
	    """Run Optuna hyperparameter optimization.

	    Steps, in order:

	    1. Parse the command line (``--n-trials`` optionally overrides the
	       trial count from the Optuna config).
	    2. Load ``config/optuna_config.yaml`` and
	       ``config/model_parameters.yaml``.
	    3. Read the survey CSV from ``data/survey_results_public.csv``; if the
	       file is missing, print an error message and return.
	    4. Filter and reduce the data, build the feature matrix and the scaled
	       target.
	    5. Run the Optuna study, print the best trial, and write the best
	       hyperparameters back to ``config/model_parameters.yaml``.

	    A non-positive ``--n-trials`` value is rejected with a usage error.
	    """
	    parser = argparse.ArgumentParser(
	        description="Optuna hyperparameter optimization for salary prediction"
	    )
	    parser.add_argument(
	        "--n-trials",
	        type=int,
	        default=None,
	        help="Number of optimization trials (overrides config default)",
	    )
	    args = parser.parse_args()
	    # Without at least one trial there is no best trial to report or save.
	    if args.n_trials is not None and args.n_trials < 1:
	        parser.error("--n-trials must be a positive integer")

	    # Load configs
	    optuna_config_path = Path("config/optuna_config.yaml")
	    with open(optuna_config_path, "r", encoding="utf-8") as f:
	        optuna_config = yaml.safe_load(f)

	    model_config_path = Path("config/model_parameters.yaml")
	    with open(model_config_path, "r", encoding="utf-8") as f:
	        config = yaml.safe_load(f)

	    # Compare against None rather than relying on truthiness, so the
	    # override is taken exactly when the flag was given.
	    if args.n_trials is not None:
	        n_trials = args.n_trials
	    else:
	        n_trials = optuna_config["study"]["n_trials"]

	    # Load and preprocess data
	    print("Loading data...")
	    data_path = Path("data/survey_results_public.csv")
	    if not data_path.exists():
	        print(f"Error: Data file not found at {data_path}")
	        print(
	            "Please download the Stack Overflow Developer Survey CSV "
	            "and place it in the data/ directory."
	        )
	        return

	    # Only these columns are read from the CSV.
	    survey_columns = [
	        "Country",
	        "YearsCode",
	        "WorkExp",
	        "EdLevel",
	        "DevType",
	        "Industry",
	        "Age",
	        "ICorPM",
	        "OrgSize",
	        "Employment",
	        "Currency",
	        "CompTotal",
	        "ConvertedCompYearly",
	    ]
	    df = pd.read_csv(data_path, usecols=survey_columns)
	    print(f"Loaded {len(df):,} rows")

	    df = filter_salaries(df, config)
	    df = apply_cardinality_reduction(df)
	    df = drop_other_rows(df, config)

	    main_label = "ConvertedCompYearly"
	    X = prepare_features(df)
	    y = df[main_label] * config["data"]["salary_scale"]

	    print(f"Feature matrix shape: {X.shape}")
	    print(f"\nStarting Optuna optimization with {n_trials} trials...")

	    # Run optimization
	    objective = build_objective(X, y, optuna_config)
	    study = optuna.create_study(
	        direction=optuna_config["study"]["direction"],
	    )
	    study.optimize(objective, n_trials=n_trials)

	    # Report results
	    print(f"\nBest trial: #{study.best_trial.number}")
	    print(f"Best MAPE: {study.best_value:.2f}%")
	    print("Best hyperparameters:")
	    for name, value in study.best_params.items():
	        print(f" {name}: {value}")

	    # Save best params to model_parameters.yaml
	    save_best_params(study.best_params, model_config_path)
	    print(f"\nBest parameters saved to {model_config_path}")


	# Script entry point: run the tuning workflow only when this module is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	main()