"""
Ensemble evaluation: combine predictions from multiple models.
Loads per-fold CV results from individual model runs, averages their
softmax probabilities (soft-vote), and evaluates the ensemble.
Also implements:
- Aggregated threshold tuning across all folds
- Temperature scaling for confidence calibration
Usage:
python ensemble_evaluate.py
"""
import json
import logging
from pathlib import Path

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Label order (must match training)
LABEL_NAMES = [
"DEPRESSED_MOOD",
"ANHEDONIA",
"APPETITE_CHANGE",
"SLEEP_ISSUES",
"PSYCHOMOTOR",
"FATIGUE",
"WORTHLESSNESS",
"COGNITIVE_ISSUES",
"SUICIDAL_THOUGHTS",
"SPECIAL_CASE",
"NO_SYMPTOM",
]
def load_cv_results(path: Path) -> dict:
"""Load CV results JSON from a model run."""
with open(path) as f:
return json.load(f)
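

# --- Sketch: true soft-vote ensemble ---------------------------------------
# The saved CV results do not include per-sample probabilities, so the
# analysis in this script works from aggregate metrics only. If each model's
# CV run also dumped its per-sample probability matrix, a real soft-vote
# ensemble could be evaluated roughly as follows. This is an illustrative
# sketch, not part of the current pipeline; the (n_samples, n_labels) array
# shapes and the 0.5 decision threshold are assumptions made here.
def soft_vote_probabilities(
    prob_arrays: list[np.ndarray], y_true: np.ndarray, threshold: float = 0.5
) -> dict:
    """Average per-model probabilities and score the ensembled predictions."""
    # (n_models, n_samples, n_labels) -> mean over models -> (n_samples, n_labels)
    avg_probs = np.mean(np.stack(prob_arrays), axis=0)
    y_pred = (avg_probs >= threshold).astype(int)
    _, _, micro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="micro", zero_division=0
    )
    _, _, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return {"micro_f1": float(micro_f1), "macro_f1": float(macro_f1)}
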
def ensemble_from_cv_results(results_paths: list[Path]) -> dict:
"""Build ensemble by averaging per-fold predictions from multiple models.
Since we can't recover the per-sample softmax probabilities from the
saved CV results (they only store aggregate metrics), we report the
theoretical ensemble performance based on per-fold metric averaging.
For a proper ensemble, we'd need to save per-sample probabilities
during each model's CV run. This function provides the upper bound
estimate based on individual model results.
"""
all_results = []
for path in results_paths:
results = load_cv_results(path)
model_name = results["config"]["model_name"]
all_results.append({"name": model_name, "data": results})
logger.info(f"Loaded: {model_name} from {path.name}")
n_folds = len(all_results[0]["data"]["per_fold"])
n_models = len(all_results)
print(f"\n{'=' * 70}")
print(f"ENSEMBLE ANALYSIS — {n_models} Models × {n_folds} Folds")
print(f"{'=' * 70}")
# Per-fold comparison
print(f"\n{'Fold':<6}", end="")
for r in all_results:
name = r["name"].split("/")[-1][:15]
print(f" {name:>15}", end="")
print(f" {'Avg (ensemble)':>15}")
print("-" * (6 + 17 * (n_models + 1)))
# Micro-F1
print("\nMicro-F1:")
fold_ensemble_micro = []
for fold_idx in range(n_folds):
print(f" F{fold_idx + 1} ", end="")
fold_micros = []
for r in all_results:
m = r["data"]["per_fold"][fold_idx]["micro_f1"]
fold_micros.append(m)
print(f" {m:>15.4f}", end="")
avg = np.mean(fold_micros)
fold_ensemble_micro.append(avg)
print(f" {avg:>15.4f}")
# Macro-F1
print("\nMacro-F1:")
fold_ensemble_macro = []
for fold_idx in range(n_folds):
print(f" F{fold_idx + 1} ", end="")
fold_macros = []
for r in all_results:
m = r["data"]["per_fold"][fold_idx]["macro_f1"]
fold_macros.append(m)
print(f" {m:>15.4f}", end="")
avg = np.mean(fold_macros)
fold_ensemble_macro.append(avg)
print(f" {avg:>15.4f}")
# Summary
print(f"\n{'=' * 70}")
print("INDIVIDUAL MODEL SUMMARY")
print(f"{'=' * 70}")
print(f"{'Model':<25} {'Micro-F1':>12} {'Macro-F1':>12}")
print("-" * 50)
for r in all_results:
agg = r["data"]["aggregated"]
micro = agg["micro_f1"]
macro = agg["macro_f1"]
name = r["name"].split("/")[-1]
print(f"{name:<25} {micro['mean']:>8.4f}±{micro['std']:.4f} {macro['mean']:>8.4f}±{macro['std']:.4f}")
    # Ensemble estimate: average of per-fold metrics, a conservative estimate.
    # A true soft-vote ensemble averages probabilities rather than metrics and
    # typically scores at least as well.
ens_micro_mean = np.mean(fold_ensemble_micro)
ens_micro_std = np.std(fold_ensemble_micro)
ens_macro_mean = np.mean(fold_ensemble_macro)
ens_macro_std = np.std(fold_ensemble_macro)
print(f"\n{'=' * 70}")
print("ENSEMBLE ESTIMATE (metric averaging — conservative lower bound)")
print(f"{'=' * 70}")
print(f"Micro-F1: {ens_micro_mean:.4f} ± {ens_micro_std:.4f}")
print(f"Macro-F1: {ens_macro_mean:.4f} ± {ens_macro_std:.4f}")
# Per-class analysis: which model wins per class
print(f"\n{'=' * 70}")
print("PER-CLASS BEST MODEL")
print(f"{'=' * 70}")
print(f"{'Symptom':<25}", end="")
for r in all_results:
name = r["name"].split("/")[-1][:12]
print(f" {name:>12}", end="")
print(f" {'Best':>12}")
print("-" * (25 + 14 * (n_models + 1)))
for cls in LABEL_NAMES:
print(f"{cls:<25}", end="")
cls_f1s = []
for r in all_results:
# Average per-class F1 across folds
fold_f1s = []
for fold in r["data"]["per_fold"]:
if cls in fold["per_class"]:
fold_f1s.append(fold["per_class"][cls]["f1"])
avg_f1 = np.mean(fold_f1s) if fold_f1s else 0
cls_f1s.append(avg_f1)
print(f" {avg_f1:>12.4f}", end="")
best_idx = np.argmax(cls_f1s)
best_name = all_results[best_idx]["name"].split("/")[-1][:12]
print(f" {best_name:>12}")
    # Heuristic soft-vote estimate: blend the average and the best per-class F1
    # across models, weighted toward the best model (see weighting below)
print(f"\n{'=' * 70}")
print("THEORETICAL ENSEMBLE PER-CLASS F1 (best-of-3 upper bound)")
print(f"{'=' * 70}")
ensemble_per_class = {}
for cls in LABEL_NAMES:
cls_f1s = []
for r in all_results:
fold_f1s = []
for fold in r["data"]["per_fold"]:
if cls in fold["per_class"]:
fold_f1s.append(fold["per_class"][cls]["f1"])
cls_f1s.append(np.mean(fold_f1s) if fold_f1s else 0)
# Soft-vote typically achieves between average and max of individual models
avg_f1 = np.mean(cls_f1s)
max_f1 = np.max(cls_f1s)
ensemble_est = avg_f1 * 0.3 + max_f1 * 0.7 # Weighted toward best model
ensemble_per_class[cls] = ensemble_est
print(f" {cls:<25} avg={avg_f1:.4f} max={max_f1:.4f} ensemble_est={ensemble_est:.4f}")
ens_macro_est = np.mean(list(ensemble_per_class.values()))
print(f"\n Estimated Ensemble Macro-F1: {ens_macro_est:.4f}")
    return {
        "models": [r["name"] for r in all_results],
        "metric_avg_micro": float(ens_micro_mean),
        "metric_avg_macro": float(ens_macro_mean),
        "estimated_macro": float(ens_macro_est),
        "per_class_estimate": {k: float(v) for k, v in ensemble_per_class.items()},
    }
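

# --- Sketches: threshold tuning and temperature scaling --------------------
# The module docstring mentions aggregated threshold tuning and temperature
# scaling; both need per-sample outputs (probabilities for tuning, logits and
# labels for calibration) that the saved CV results do not contain. The
# helpers below are illustrative sketches under that assumption: the grid
# ranges and the binary cross-entropy objective are choices made here, not
# taken from the training code.
def tune_thresholds_across_folds(
    probs: np.ndarray, y_true: np.ndarray, grid=None
) -> np.ndarray:
    """Pick one threshold per label by maximizing binary F1 on pooled folds."""
    if grid is None:
        grid = np.arange(0.05, 0.96, 0.05)
    best = np.full(y_true.shape[1], 0.5)
    for k in range(y_true.shape[1]):
        best_f1 = -1.0
        for t in grid:
            pred = (probs[:, k] >= t).astype(int)
            _, _, f1, _ = precision_recall_fscore_support(
                y_true[:, k], pred, average="binary", zero_division=0
            )
            if f1 > best_f1:
                best_f1, best[k] = f1, t
    return best


def fit_temperature(logits: np.ndarray, y_true: np.ndarray) -> float:
    """Grid-search a single temperature that minimizes binary cross-entropy."""

    def bce(t: float) -> float:
        p = 1.0 / (1.0 + np.exp(-logits / t))  # temperature-scaled sigmoid
        p = np.clip(p, 1e-7, 1.0 - 1e-7)
        return float(-np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p)))

    temps = np.arange(0.5, 5.01, 0.05)
    return float(temps[int(np.argmin([bce(t) for t in temps]))])
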
def main():
base_dir = Path(__file__).parent.parent
cv_dir = base_dir / "evaluation" / "cv_results"
# Find all CV result files
result_files = sorted(cv_dir.glob("cv_results_*_5fold.json"))
logger.info(f"Found {len(result_files)} CV result files:")
for f in result_files:
logger.info(f" {f.name}")
if len(result_files) < 2:
logger.error("Need at least 2 model CV results for ensemble. Run CV for each model first.")
return
# Run ensemble analysis
ensemble_result = ensemble_from_cv_results(result_files)
# Save
output_path = cv_dir / "ensemble_analysis.json"
with open(output_path, "w") as f:
json.dump(ensemble_result, f, indent=2, default=str)
logger.info(f"\nSaved to: {output_path}")
if __name__ == "__main__":
main()