| """Complete the training: RF tuning + Voting Ensemble + Save.""" |
| import os, sys |
| sys.path.insert(0, '/app/fraud_detection') |
| import numpy as np |
| import pandas as pd |
| import joblib |
| import optuna |
| optuna.logging.set_verbosity(optuna.logging.WARNING) |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| from sklearn.ensemble import RandomForestClassifier, VotingClassifier |
| from sklearn.metrics import roc_auc_score, average_precision_score |
| from config import DATA_DIR, MODELS_DIR, SEED |
|
|
| |
# Restore the preprocessed split produced by the data-preparation step.
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
# Per-class weights used to counter the fraud/non-fraud imbalance downstream.
class_weights = data['class_weights']
|
|
| |
# Resume from the persisted model registry so already-trained models are not redone.
saved_models = joblib.load(os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
print(f"Loaded {len(saved_models)} models: {list(saved_models.keys())}")
|
|
| |
| need_rf_tune = 'Random_Forest_Tuned' not in saved_models |
| need_xgb_tune = 'XGBoost_Tuned' not in saved_models |
| need_lgbm_tune = 'LightGBM_Tuned' not in saved_models |
|
|
| print(f"Need RF tune: {need_rf_tune}, XGB tune: {need_xgb_tune}, LGBM tune: {need_lgbm_tune}") |
|
|
| |
if need_rf_tune:
    print("\n--- Quick Optuna RF Tuning (5 trials) ---")

    def objective(trial):
        """Fit an RF with trial-suggested hyperparameters; maximize validation PR-AUC."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 200),
            'max_depth': trial.suggest_int('max_depth', 8, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'class_weight': class_weights,
            'random_state': SEED,
            'n_jobs': -1
        }
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)
        val_pred = model.predict_proba(X_val)[:, 1]
        # PR-AUC is the tuning target: more informative than ROC-AUC on
        # heavily imbalanced fraud data.
        return average_precision_score(y_val, val_pred)

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=5, show_progress_bar=False)
    print(f" Best PR-AUC: {study.best_value:.4f}")
    print(f" Best params: {study.best_params}")

    # Copy before augmenting: study.best_params may hand back optuna's internal
    # params dict, and the fixed (non-tuned) settings must not leak into it.
    best_params = dict(study.best_params)
    best_params['class_weight'] = class_weights
    best_params['random_state'] = SEED
    best_params['n_jobs'] = -1
    best_model = RandomForestClassifier(**best_params)
    best_model.fit(X_train, y_train)
    saved_models['Random_Forest_Tuned'] = best_model

    # Persist only the tuned hyperparameters; join the path once instead of thrice.
    tuning_path = os.path.join(MODELS_DIR, "tuning_results.joblib")
    tuning_results = joblib.load(tuning_path) if os.path.exists(tuning_path) else {}
    tuning_results['random_forest'] = dict(study.best_params)
    joblib.dump(tuning_results, tuning_path)
|
|
| |
if need_xgb_tune or need_lgbm_tune:
    print("XGB/LGBM tuned models missing, re-running...")
    import xgboost as xgb
    import lightgbm as lgb

    # Load the persisted tuning results once; both retrains read from it.
    tuning = joblib.load(os.path.join(MODELS_DIR, "tuning_results.joblib"))
    # Class-imbalance correction shared by both boosting libraries.
    scale_pos_weight = class_weights[1] / class_weights[0]

    if need_xgb_tune:
        if 'xgboost' in tuning:
            # Copy so the fixed settings below don't mutate the loaded dict.
            bp = dict(tuning['xgboost'])
            bp['scale_pos_weight'] = scale_pos_weight
            bp['random_state'] = SEED
            bp['eval_metric'] = 'aucpr'
            bp['n_jobs'] = -1
            bp['tree_method'] = 'hist'
            m = xgb.XGBClassifier(**bp)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            saved_models['XGBoost_Tuned'] = m
        else:
            # Previously a silent no-op; make the skipped retrain visible.
            print("  WARNING: no 'xgboost' entry in tuning results; skipping XGBoost retrain")

    if need_lgbm_tune:
        if 'lightgbm' in tuning:
            bp = dict(tuning['lightgbm'])
            bp['scale_pos_weight'] = scale_pos_weight
            bp['random_state'] = SEED
            bp['n_jobs'] = -1
            bp['verbose'] = -1
            m = lgb.LGBMClassifier(**bp)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)])
            saved_models['LightGBM_Tuned'] = m
        else:
            print("  WARNING: no 'lightgbm' entry in tuning results; skipping LightGBM retrain")
|
|
| |
if 'Voting_Ensemble' not in saved_models:
    print("\n--- Creating Voting Ensemble ---")
    # Collect whichever tuned base models actually made it into the registry.
    ensemble_members = [(name, saved_models[name])
                        for name in ('XGBoost_Tuned', 'LightGBM_Tuned', 'Random_Forest_Tuned')
                        if name in saved_models]

    print(f" Members: {[n for n, _ in ensemble_members]}")

    if len(ensemble_members) >= 2:
        # Soft voting averages predicted probabilities across members.
        voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
        voting_clf.fit(X_train, y_train)
        saved_models['Voting_Ensemble'] = voting_clf

        val_pred = voting_clf.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, val_pred)
        val_pr_auc = average_precision_score(y_val, val_pred)
        print(f" Voting Ensemble Val ROC-AUC: {val_auc:.4f}, PR-AUC: {val_pr_auc:.4f}")
    else:
        # VotingClassifier.fit raises on an empty estimator list, and an
        # ensemble of one member adds nothing — skip instead of crashing.
        print(" Skipping ensemble: need at least 2 tuned members")
|
|
| |
| joblib.dump(saved_models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib")) |
| save_models = {k: v for k, v in saved_models.items() if k != 'Autoencoder'} |
| joblib.dump(save_models, os.path.join(MODELS_DIR, "all_models.joblib")) |
|
|
| print(f"\nFinal models saved: {list(saved_models.keys())}") |
| print("TRAINING COMPLETE") |
|
|