| """ |
| Ensemble evaluation: combine predictions from multiple models. |
| |
| Loads per-fold CV results from individual model runs, averages their |
| softmax probabilities (soft-vote), and evaluates the ensemble. |
| |
| Also implements: |
| - Aggregated threshold tuning across all folds |
| - Temperature scaling for confidence calibration |
| |
| Usage: |
| python ensemble_evaluate.py |
| """ |
|
|
import json
import logging
from pathlib import Path

import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


LABEL_NAMES = [
    "DEPRESSED_MOOD",
    "ANHEDONIA",
    "APPETITE_CHANGE",
    "SLEEP_ISSUES",
    "PSYCHOMOTOR",
    "FATIGUE",
    "WORTHLESSNESS",
    "COGNITIVE_ISSUES",
    "SUICIDAL_THOUGHTS",
    "SPECIAL_CASE",
    "NO_SYMPTOM",
]


def load_cv_results(path: Path) -> dict:
    """Load CV results JSON from a model run."""
    with open(path) as f:
        return json.load(f)
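

# The analysis below cannot perform a true soft-vote, because the saved CV
# results contain no per-sample probabilities. As a minimal sketch of the
# real thing: if each model's CV run also saved an aligned
# (n_samples, n_classes) softmax-probability array (hypothetical; not
# produced by the current pipeline), soft voting would reduce to:
def soft_vote_probs(prob_arrays: list[np.ndarray]) -> np.ndarray:
    """Average aligned (n_samples, n_classes) probability arrays.

    Hypothetical usage: soft_vote_probs([p1, p2, p3]).argmax(axis=1)
    """
    stacked = np.stack(prob_arrays, axis=0)  # (n_models, n_samples, n_classes)
    return stacked.mean(axis=0)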
|
|
|
|
def ensemble_from_cv_results(results_paths: list[Path]) -> dict:
    """Estimate ensemble performance from the per-fold metrics of several models.

    Since the per-sample softmax probabilities can't be recovered from the
    saved CV results (they only store aggregate metrics), we report an
    ensemble estimate based on per-fold metric averaging instead of a true
    soft-vote (see soft_vote_probs above).

    Metric averaging is a conservative estimate: a soft-vote ensemble
    typically matches or beats the average of its members. The per-class
    best-model figures printed at the end give the optimistic counterpart.
    For a proper ensemble, per-sample probabilities would need to be saved
    during each model's CV run.
    """
    all_results = []
    for path in results_paths:
        results = load_cv_results(path)
        model_name = results["config"]["model_name"]
        all_results.append({"name": model_name, "data": results})
        logger.info(f"Loaded: {model_name} from {path.name}")

    n_folds = len(all_results[0]["data"]["per_fold"])
    n_models = len(all_results)

    print(f"\n{'=' * 70}")
    print(f"ENSEMBLE ANALYSIS — {n_models} Models × {n_folds} Folds")
    print(f"{'=' * 70}")

    # Table header: one 16-char column per model, plus the ensemble average.
    print(f"\n{'Fold':<6}", end="")
    for r in all_results:
        name = r["name"].split("/")[-1][:15]
        print(f" {name:>15}", end="")
    print(f" {'Avg (ensemble)':>15}")
    print("-" * (6 + 16 * (n_models + 1)))

    print("\nMicro-F1:")
    fold_ensemble_micro = []
    for fold_idx in range(n_folds):
        print(f" F{fold_idx + 1} ", end="")
        fold_micros = []
        for r in all_results:
            m = r["data"]["per_fold"][fold_idx]["micro_f1"]
            fold_micros.append(m)
            print(f" {m:>15.4f}", end="")
        avg = np.mean(fold_micros)
        fold_ensemble_micro.append(avg)
        print(f" {avg:>15.4f}")

    print("\nMacro-F1:")
    fold_ensemble_macro = []
    for fold_idx in range(n_folds):
        print(f" F{fold_idx + 1} ", end="")
        fold_macros = []
        for r in all_results:
            m = r["data"]["per_fold"][fold_idx]["macro_f1"]
            fold_macros.append(m)
            print(f" {m:>15.4f}", end="")
        avg = np.mean(fold_macros)
        fold_ensemble_macro.append(avg)
        print(f" {avg:>15.4f}")

    print(f"\n{'=' * 70}")
    print("INDIVIDUAL MODEL SUMMARY")
    print(f"{'=' * 70}")
    print(f"{'Model':<25} {'Micro-F1':>12} {'Macro-F1':>12}")
    print("-" * 50)
    for r in all_results:
        agg = r["data"]["aggregated"]
        micro = agg["micro_f1"]
        macro = agg["macro_f1"]
        name = r["name"].split("/")[-1]
        print(f"{name:<25} {micro['mean']:>8.4f}±{micro['std']:.4f} {macro['mean']:>8.4f}±{macro['std']:.4f}")

    ens_micro_mean = np.mean(fold_ensemble_micro)
    ens_micro_std = np.std(fold_ensemble_micro)
    ens_macro_mean = np.mean(fold_ensemble_macro)
    ens_macro_std = np.std(fold_ensemble_macro)

    print(f"\n{'=' * 70}")
    print("ENSEMBLE ESTIMATE (metric averaging — conservative lower bound)")
    print(f"{'=' * 70}")
    print(f"Micro-F1: {ens_micro_mean:.4f} ± {ens_micro_std:.4f}")
    print(f"Macro-F1: {ens_macro_mean:.4f} ± {ens_macro_std:.4f}")

    print(f"\n{'=' * 70}")
    print("PER-CLASS BEST MODEL")
    print(f"{'=' * 70}")
    print(f"{'Symptom':<25}", end="")
    for r in all_results:
        name = r["name"].split("/")[-1][:12]
        print(f" {name:>12}", end="")
    print(f" {'Best':>12}")
    print("-" * (25 + 13 * (n_models + 1)))

    for cls in LABEL_NAMES:
        print(f"{cls:<25}", end="")
        cls_f1s = []
        for r in all_results:
            # Average this model's F1 for the class across folds.
            fold_f1s = []
            for fold in r["data"]["per_fold"]:
                if cls in fold["per_class"]:
                    fold_f1s.append(fold["per_class"][cls]["f1"])
            avg_f1 = np.mean(fold_f1s) if fold_f1s else 0.0
            cls_f1s.append(avg_f1)
            print(f" {avg_f1:>12.4f}", end="")

        best_idx = np.argmax(cls_f1s)
        best_name = all_results[best_idx]["name"].split("/")[-1][:12]
        print(f" {best_name:>12}")

    print(f"\n{'=' * 70}")
    print(f"THEORETICAL ENSEMBLE PER-CLASS F1 (best-of-{n_models} upper bound)")
    print(f"{'=' * 70}")
    ensemble_per_class = {}
    for cls in LABEL_NAMES:
        cls_f1s = []
        for r in all_results:
            fold_f1s = []
            for fold in r["data"]["per_fold"]:
                if cls in fold["per_class"]:
                    fold_f1s.append(fold["per_class"][cls]["f1"])
            cls_f1s.append(np.mean(fold_f1s) if fold_f1s else 0.0)

        # Heuristic estimate: weight the best single model's F1 (0.7) above
        # the across-model average (0.3) to approximate per-class soft-vote
        # gains; the weights are a rough guess, not a fitted quantity.
        avg_f1 = np.mean(cls_f1s)
        max_f1 = np.max(cls_f1s)
        ensemble_est = avg_f1 * 0.3 + max_f1 * 0.7
        ensemble_per_class[cls] = ensemble_est
        print(f" {cls:<25} avg={avg_f1:.4f} max={max_f1:.4f} ensemble_est={ensemble_est:.4f}")

    ens_macro_est = np.mean(list(ensemble_per_class.values()))
    print(f"\n Estimated Ensemble Macro-F1: {ens_macro_est:.4f}")

    return {
        "models": [r["name"] for r in all_results],
        "metric_avg_micro": ens_micro_mean,
        "metric_avg_macro": ens_macro_mean,
        "estimated_macro": ens_macro_est,
        "per_class_estimate": ensemble_per_class,
    }
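

# The module docstring mentions aggregated threshold tuning across folds.
# Like soft voting, it needs per-sample outputs; this sketch assumes pooled
# out-of-fold probabilities `probs` (n_samples, n_classes) and binary
# multi-label targets `y_true` of the same shape, both hypothetical here.
def tune_thresholds(probs: np.ndarray, y_true: np.ndarray) -> np.ndarray:
    """Grid-search a per-class decision threshold that maximizes F1."""
    thresholds = np.full(probs.shape[1], 0.5)
    for c in range(probs.shape[1]):
        best_f1 = -1.0
        for t in np.linspace(0.05, 0.95, 19):
            pred = probs[:, c] >= t
            tp = np.sum(pred & (y_true[:, c] == 1))
            fp = np.sum(pred & (y_true[:, c] == 0))
            fn = np.sum(~pred & (y_true[:, c] == 1))
            denom = 2 * tp + fp + fn
            f1 = 2 * tp / denom if denom > 0 else 0.0
            if f1 > best_f1:
                best_f1, thresholds[c] = f1, t
    return thresholds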
|
|
|
|
def main():
    base_dir = Path(__file__).parent.parent
    cv_dir = base_dir / "evaluation" / "cv_results"

    result_files = sorted(cv_dir.glob("cv_results_*_5fold.json"))
    logger.info(f"Found {len(result_files)} CV result files:")
    for f in result_files:
        logger.info(f" {f.name}")

    if len(result_files) < 2:
        logger.error("Need at least 2 model CV results for ensemble. Run CV for each model first.")
        return

    ensemble_result = ensemble_from_cv_results(result_files)

    output_path = cv_dir / "ensemble_analysis.json"
    with open(output_path, "w") as f:
        json.dump(ensemble_result, f, indent=2, default=str)

    logger.info(f"\nSaved to: {output_path}")
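

# The module docstring also mentions temperature scaling for confidence
# calibration. It operates on per-sample logits, which are likewise not in
# the saved CV results; the `logits` (n_samples, n_classes) and integer
# `labels` arrays below are hypothetical held-out data. This is a minimal
# grid-search sketch of the standard recipe (Guo et al., 2017): pick the
# scalar T > 0 that minimizes negative log-likelihood, then divide logits
# by T before the softmax at inference time.
def fit_temperature(logits: np.ndarray, labels: np.ndarray) -> float:
    """Return the temperature minimizing NLL on held-out (logits, labels)."""

    def nll(t: float) -> float:
        z = logits / t
        z = z - z.max(axis=1, keepdims=True)  # stabilize the softmax
        log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
        return -log_probs[np.arange(len(labels)), labels].mean()

    return float(min(np.linspace(0.5, 5.0, 46), key=nll))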
|
|
|
|
if __name__ == "__main__":
    main()
|
|