| """ |
| Module 3: Model Training - Optimized for speed |
| Train all models: LR, RF, XGBoost, LightGBM, MLP, Autoencoder, Voting Ensemble. |
| Hyperparameter tuning with Optuna. |
| """ |
| import os, sys |
| sys.path.insert(0, '/app/fraud_detection') |
| import numpy as np |
| import pandas as pd |
| import joblib |
| import optuna |
| optuna.logging.set_verbosity(optuna.logging.WARNING) |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
import xgboost as xgb
import lightgbm as lgb

from config import DATA_DIR, MODELS_DIR, SEED

# Make sure the output directory exists before any artifact is written.
os.makedirs(MODELS_DIR, exist_ok=True)

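# Load the artifacts produced by the preprocessing step: train/val/test splits,
# a SMOTE-resampled copy of the training set for models that cannot weight
# classes, and per-class weights for the ones that can.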
data = joblib.load(os.path.join(DATA_DIR, "processed_data.joblib"))
X_train = data['X_train']
X_val = data['X_val']
X_test = data['X_test']
y_train = data['y_train']
y_val = data['y_val']
y_test = data['y_test']
X_train_smote = data['X_train_smote']
y_train_smote = data['y_train_smote']
class_weights = data['class_weights']
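# Ratio of the minority to the majority class weight. Assuming class_weights
# follows sklearn's 'balanced' scheme (w_c = n_samples / (2 * n_c)), this ratio
# equals n_negative / n_positive, which is exactly what XGBoost and LightGBM
# expect for scale_pos_weight.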
scale_pos_weight = class_weights[1] / class_weights[0]

print(f"Data loaded. Train: {X_train.shape}, Val: {X_val.shape}")

models = {}

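# [1/8] Logistic regression baseline: strong L2 regularization (C=0.1) plus
# class weighting so the rare fraud class is not drowned out.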
| print("\n[1/8] Logistic Regression...") |
| lr = LogisticRegression(class_weight=class_weights, max_iter=1000, random_state=SEED, C=0.1, solver='lbfgs') |
| lr.fit(X_train, y_train) |
| models['Logistic_Regression'] = lr |
| p = lr.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
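# [2/8] Random forest: depth capped at 12 to limit overfitting; class weights
# tilt the split criterion toward the minority class.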
| print("\n[2/8] Random Forest...") |
| rf = RandomForestClassifier(n_estimators=150, max_depth=12, class_weight=class_weights, random_state=SEED, n_jobs=-1) |
| rf.fit(X_train, y_train) |
| models['Random_Forest'] = rf |
| p = rf.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
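# [3/8] XGBoost: histogram tree method for speed, scale_pos_weight for the
# imbalance, 'aucpr' as the eval metric. Note eval_set is monitoring only:
# without early_stopping_rounds, all 200 trees are always built.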
| print("\n[3/8] XGBoost...") |
| xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, scale_pos_weight=scale_pos_weight, subsample=0.8, colsample_bytree=0.8, random_state=SEED, eval_metric='aucpr', n_jobs=-1, tree_method='hist') |
| xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) |
| models['XGBoost'] = xgb_model |
| p = xgb_model.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
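# [4/8] LightGBM: same imbalance handling as XGBoost; the lower learning rate
# (0.05) is a common guard against leaf-wise growth overfitting.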
| print("\n[4/8] LightGBM...") |
| lgbm_model = lgb.LGBMClassifier(n_estimators=200, max_depth=8, learning_rate=0.05, scale_pos_weight=scale_pos_weight, subsample=0.8, colsample_bytree=0.8, random_state=SEED, n_jobs=-1, verbose=-1) |
| lgbm_model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) |
| models['LightGBM'] = lgbm_model |
| p = lgbm_model.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
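# [5/8] MLP: sklearn's MLPClassifier has no class_weight option, so it trains
# on the SMOTE-resampled set instead; early stopping holds out 10% of that
# (resampled) data as its own validation split.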
| print("\n[5/8] MLP Neural Network...") |
| mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', alpha=0.001, batch_size=256, learning_rate='adaptive', max_iter=200, random_state=SEED, early_stopping=True, n_iter_no_change=10) |
| mlp.fit(X_train_smote, y_train_smote) |
| models['MLP'] = mlp |
| p = mlp.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
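# [6/8] Autoencoder anomaly detector: trained only on legitimate transactions,
# it learns to reconstruct "normal" rows, so a high per-row reconstruction
# error (MSE) serves as the fraud score.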
| print("\n[6/8] Autoencoder...") |
| import torch |
| import torch.nn as nn |
| from torch.utils.data import DataLoader, TensorDataset |
|
|
| X_train_legit = X_train[y_train == 0] |
| X_train_np = X_train_legit.values if isinstance(X_train_legit, pd.DataFrame) else X_train_legit |
| input_dim = X_train_np.shape[1] |
class Autoencoder(nn.Module):
    """Bottleneck autoencoder: d -> 64 -> 32 -> 16 -> 32 -> 64 -> d."""
    def __init__(self, d):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(d, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 16), nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(16, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, d))

    def forward(self, x):
        return self.decoder(self.encoder(x))

ae_model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(ae_model.parameters(), lr=0.001, weight_decay=1e-5)
# Inputs double as reconstruction targets.
train_loader = DataLoader(TensorDataset(torch.FloatTensor(X_train_np), torch.FloatTensor(X_train_np)),
                          batch_size=256, shuffle=True)

ae_model.train()
for epoch in range(50):
    eloss = 0
    for bx, _ in train_loader:
        optimizer.zero_grad()
        out = ae_model(bx)
        loss = criterion(out, bx)
        loss.backward()
        optimizer.step()
        eloss += loss.item()
    if (epoch+1) % 10 == 0:
        print(f" Epoch {epoch+1}/50, Loss: {eloss/len(train_loader):.6f}")

# Score validation rows: higher reconstruction error = more anomalous.
ae_model.eval()
X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else X_val
with torch.no_grad():
    val_out = ae_model(torch.FloatTensor(X_val_np))
    recon_error = torch.mean((val_out - torch.FloatTensor(X_val_np))**2, dim=1).numpy()
print(f" ROC-AUC: {roc_auc_score(y_val, recon_error):.4f}, PR-AUC: {average_precision_score(y_val, recon_error):.4f}")

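# Wrap the torch model in a minimal sklearn-style interface so the evaluation
# code can call predict_proba on it like any other model. The sigmoid mapping
# inside is an ad-hoc score squashing, not a calibrated probability.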
class AutoencoderWrapper:
    """sklearn-style facade over the torch autoencoder."""
    def __init__(self, model):
        self.model = model
        self.classes_ = np.array([0, 1])

    def predict_proba(self, X):
        self.model.eval()
        Xn = X.values if isinstance(X, pd.DataFrame) else X
        with torch.no_grad():
            Xt = torch.FloatTensor(Xn)
            out = self.model(Xt)
            re = torch.mean((out - Xt)**2, dim=1).numpy()
        # Sigmoid centered on the batch median; the median is recomputed per
        # call, so scores are only comparable within a single batch.
        scores = 1 / (1 + np.exp(-10 * (re - np.median(re))))
        return np.column_stack([1 - scores, scores])

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)

models['Autoencoder'] = AutoencoderWrapper(ae_model)
# Save raw weights via state_dict, PyTorch's portable checkpoint format.
torch.save(ae_model.state_dict(), os.path.join(MODELS_DIR, "autoencoder.pt"))

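# [7/8] Optuna tuning: TPE sampler maximizing validation PR-AUC, the metric
# that matters most under heavy class imbalance. Trial budgets are deliberately
# small (5-15) to keep this module fast.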
| print("\n[7/8] Optuna Tuning...") |
|
|
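# Short trial keys ('lr', 'ss', ...) keep the search space definition compact;
# they are mapped back to the real constructor arguments when the best model
# is rebuilt below.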
| print(" Tuning XGBoost (15 trials)...") |
| def xgb_obj(trial): |
| m = xgb.XGBClassifier(n_estimators=trial.suggest_int('n_estimators', 100, 250), max_depth=trial.suggest_int('max_depth', 4, 9), learning_rate=trial.suggest_float('lr', 0.01, 0.3, log=True), subsample=trial.suggest_float('ss', 0.6, 1.0), colsample_bytree=trial.suggest_float('csb', 0.6, 1.0), reg_alpha=trial.suggest_float('ra', 1e-4, 10, log=True), reg_lambda=trial.suggest_float('rl', 1e-4, 10, log=True), min_child_weight=trial.suggest_int('mcw', 1, 8), scale_pos_weight=scale_pos_weight, random_state=SEED, eval_metric='aucpr', n_jobs=-1, tree_method='hist') |
| m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) |
| return average_precision_score(y_val, m.predict_proba(X_val)[:, 1]) |
|
|
| s = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED)) |
| s.optimize(xgb_obj, n_trials=15) |
| print(f" Best PR-AUC: {s.best_value:.4f}") |
| bp = s.best_params |
| xgb_best = xgb.XGBClassifier(n_estimators=bp['n_estimators'], max_depth=bp['max_depth'], learning_rate=bp['lr'], subsample=bp['ss'], colsample_bytree=bp['csb'], reg_alpha=bp['ra'], reg_lambda=bp['rl'], min_child_weight=bp['mcw'], scale_pos_weight=scale_pos_weight, random_state=SEED, eval_metric='aucpr', n_jobs=-1, tree_method='hist') |
| xgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) |
| models['XGBoost_Tuned'] = xgb_best |
| xgb_tune_params = s.best_params |
|
|
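# Same pattern for LightGBM, with num_leaves as the extra capacity knob since
# LightGBM grows trees leaf-wise rather than level-wise.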
| print(" Tuning LightGBM (15 trials)...") |
| def lgb_obj(trial): |
| m = lgb.LGBMClassifier(n_estimators=trial.suggest_int('n_estimators', 100, 300), max_depth=trial.suggest_int('max_depth', 4, 10), learning_rate=trial.suggest_float('lr', 0.01, 0.3, log=True), subsample=trial.suggest_float('ss', 0.6, 1.0), colsample_bytree=trial.suggest_float('csb', 0.6, 1.0), reg_alpha=trial.suggest_float('ra', 1e-4, 10, log=True), reg_lambda=trial.suggest_float('rl', 1e-4, 10, log=True), num_leaves=trial.suggest_int('nl', 15, 100), scale_pos_weight=scale_pos_weight, random_state=SEED, n_jobs=-1, verbose=-1) |
| m.fit(X_train, y_train, eval_set=[(X_val, y_val)]) |
| return average_precision_score(y_val, m.predict_proba(X_val)[:, 1]) |
|
|
| s2 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED)) |
| s2.optimize(lgb_obj, n_trials=15) |
| print(f" Best PR-AUC: {s2.best_value:.4f}") |
| bp2 = s2.best_params |
| lgb_best = lgb.LGBMClassifier(n_estimators=bp2['n_estimators'], max_depth=bp2['max_depth'], learning_rate=bp2['lr'], subsample=bp2['ss'], colsample_bytree=bp2['csb'], reg_alpha=bp2['ra'], reg_lambda=bp2['rl'], num_leaves=bp2['nl'], scale_pos_weight=scale_pos_weight, random_state=SEED, n_jobs=-1, verbose=-1) |
| lgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)]) |
| models['LightGBM_Tuned'] = lgb_best |
| lgb_tune_params = s2.best_params |
|
|
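# Only 5 trials here: random forest fits are comparatively slow and, unlike the
# boosters, gain less from fine-grained hyperparameter search.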
| print(" Tuning Random Forest (5 trials)...") |
| def rf_obj(trial): |
| m = RandomForestClassifier(n_estimators=trial.suggest_int('ne', 100, 200), max_depth=trial.suggest_int('md', 8, 15), min_samples_split=trial.suggest_int('mss', 2, 10), min_samples_leaf=trial.suggest_int('msl', 1, 5), class_weight=class_weights, random_state=SEED, n_jobs=-1) |
| m.fit(X_train, y_train) |
| return average_precision_score(y_val, m.predict_proba(X_val)[:, 1]) |
|
|
| s3 = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED)) |
| s3.optimize(rf_obj, n_trials=5) |
| print(f" Best PR-AUC: {s3.best_value:.4f}") |
| bp3 = s3.best_params |
| rf_best = RandomForestClassifier(n_estimators=bp3['ne'], max_depth=bp3['md'], min_samples_split=bp3['mss'], min_samples_leaf=bp3['msl'], class_weight=class_weights, random_state=SEED, n_jobs=-1) |
| rf_best.fit(X_train, y_train) |
| models['Random_Forest_Tuned'] = rf_best |
| rf_tune_params = s3.best_params |
|
|
# Persist the winning hyperparameters for later inspection.
tuning_results = {'xgboost': bp, 'lightgbm': bp2, 'random_forest': bp3}
joblib.dump(tuning_results, os.path.join(MODELS_DIR, "tuning_results.joblib"))

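# [8/8] Soft voting averages the tuned models' predicted probabilities. Note
# that VotingClassifier fits fresh clones of its estimators, so the tuned
# hyperparameters are kept but the three models are retrained here.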
| print("\n[8/8] Voting Ensemble...") |
| ensemble_members = [('XGBoost_Tuned', models['XGBoost_Tuned']), ('LightGBM_Tuned', models['LightGBM_Tuned']), ('Random_Forest_Tuned', models['Random_Forest_Tuned'])] |
| voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft') |
| voting_clf.fit(X_train, y_train) |
| models['Voting_Ensemble'] = voting_clf |
| p = voting_clf.predict_proba(X_val)[:, 1] |
| print(f" ROC-AUC: {roc_auc_score(y_val, p):.4f}, PR-AUC: {average_precision_score(y_val, p):.4f}") |
|
|
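# Two bundles: the full set including the torch-backed autoencoder wrapper, and
# a torch-free set that remains loadable in environments without PyTorch.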
joblib.dump(models, os.path.join(MODELS_DIR, "all_models_with_ae.joblib"))
save_models = {k: v for k, v in models.items() if k != 'Autoencoder'}
joblib.dump(save_models, os.path.join(MODELS_DIR, "all_models.joblib"))

print("\n=== ALL TRAINING COMPLETE ===")
print(f"Models: {list(models.keys())}")