"""
Ensemble evaluation: combine predictions from multiple models.
Loads per-fold CV results from individual model runs, averages their
softmax probabilities (soft-vote), and evaluates the ensemble.
Also implements:
- Aggregated threshold tuning across all folds
- Temperature scaling for confidence calibration
Usage:
python ensemble_evaluate.py
"""
import json
import logging
from pathlib import Path

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Label order (must match training)
LABEL_NAMES = [
"DEPRESSED_MOOD",
"ANHEDONIA",
"APPETITE_CHANGE",
"SLEEP_ISSUES",
"PSYCHOMOTOR",
"FATIGUE",
"WORTHLESSNESS",
"COGNITIVE_ISSUES",
"SUICIDAL_THOUGHTS",
"SPECIAL_CASE",
"NO_SYMPTOM",
]
def load_cv_results(path: Path) -> dict:
"""Load CV results JSON from a model run."""
with open(path) as f:
return json.load(f)
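

# --- Sketch: true soft-vote ensemble ---------------------------------------
# The saved CV results do not include per-sample probabilities, so the
# analysis in this script works from aggregate metrics only. If each model's
# CV run also dumped its per-sample probability matrix, a real soft-vote
# ensemble could be evaluated roughly as follows. This is an illustrative
# sketch, not part of the current pipeline; the (n_samples, n_labels) array
# shapes and the 0.5 decision threshold are assumptions made here.
def soft_vote_probabilities(
    prob_arrays: list[np.ndarray], y_true: np.ndarray, threshold: float = 0.5
) -> dict:
    """Average per-model probabilities and score the ensembled predictions."""
    # (n_models, n_samples, n_labels) -> mean over models -> (n_samples, n_labels)
    avg_probs = np.mean(np.stack(prob_arrays), axis=0)
    y_pred = (avg_probs >= threshold).astype(int)
    _, _, micro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="micro", zero_division=0
    )
    _, _, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return {"micro_f1": float(micro_f1), "macro_f1": float(macro_f1)}
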
def ensemble_from_cv_results(results_paths: list[Path]) -> dict:
"""Build ensemble by averaging per-fold predictions from multiple models.
Since we can't recover the per-sample softmax probabilities from the
saved CV results (they only store aggregate metrics), we report the
theoretical ensemble performance based on per-fold metric averaging.
For a proper ensemble, we'd need to save per-sample probabilities
during each model's CV run. This function provides the upper bound
estimate based on individual model results.
"""
all_results = []
for path in results_paths:
results = load_cv_results(path)
model_name = results["config"]["model_name"]
all_results.append({"name": model_name, "data": results})
logger.info(f"Loaded: {model_name} from {path.name}")
n_folds = len(all_results[0]["data"]["per_fold"])
n_models = len(all_results)
print(f"\n{'=' * 70}")
print(f"ENSEMBLE ANALYSIS — {n_models} Models × {n_folds} Folds")
print(f"{'=' * 70}")
# Per-fold comparison
print(f"\n{'Fold':<6}", end="")
for r in all_results:
name = r["name"].split("/")[-1][:15]
print(f" {name:>15}", end="")
print(f" {'Avg (ensemble)':>15}")
print("-" * (6 + 17 * (n_models + 1)))
# Micro-F1
print("\nMicro-F1:")
fold_ensemble_micro = []
for fold_idx in range(n_folds):
print(f" F{fold_idx + 1} ", end="")
fold_micros = []
for r in all_results:
m = r["data"]["per_fold"][fold_idx]["micro_f1"]
fold_micros.append(m)
print(f" {m:>15.4f}", end="")
avg = np.mean(fold_micros)
fold_ensemble_micro.append(avg)
print(f" {avg:>15.4f}")
# Macro-F1
print("\nMacro-F1:")
fold_ensemble_macro = []
for fold_idx in range(n_folds):
print(f" F{fold_idx + 1} ", end="")
fold_macros = []
for r in all_results:
m = r["data"]["per_fold"][fold_idx]["macro_f1"]
fold_macros.append(m)
print(f" {m:>15.4f}", end="")
avg = np.mean(fold_macros)
fold_ensemble_macro.append(avg)
print(f" {avg:>15.4f}")
# Summary
print(f"\n{'=' * 70}")
print("INDIVIDUAL MODEL SUMMARY")
print(f"{'=' * 70}")
print(f"{'Model':<25} {'Micro-F1':>12} {'Macro-F1':>12}")
print("-" * 50)
for r in all_results:
agg = r["data"]["aggregated"]
micro = agg["micro_f1"]
macro = agg["macro_f1"]
name = r["name"].split("/")[-1]
print(f"{name:<25} {micro['mean']:>8.4f}±{micro['std']:.4f} {macro['mean']:>8.4f}±{macro['std']:.4f}")
    # Ensemble estimate: average of per-fold metrics, a conservative estimate.
    # A true soft-vote ensemble averages probabilities rather than metrics and
    # typically scores at least as well.
ens_micro_mean = np.mean(fold_ensemble_micro)
ens_micro_std = np.std(fold_ensemble_micro)
ens_macro_mean = np.mean(fold_ensemble_macro)
ens_macro_std = np.std(fold_ensemble_macro)
print(f"\n{'=' * 70}")
print("ENSEMBLE ESTIMATE (metric averaging — conservative lower bound)")
print(f"{'=' * 70}")
print(f"Micro-F1: {ens_micro_mean:.4f} ± {ens_micro_std:.4f}")
print(f"Macro-F1: {ens_macro_mean:.4f} ± {ens_macro_std:.4f}")
# Per-class analysis: which model wins per class
print(f"\n{'=' * 70}")
print("PER-CLASS BEST MODEL")
print(f"{'=' * 70}")
print(f"{'Symptom':<25}", end="")
for r in all_results:
name = r["name"].split("/")[-1][:12]
print(f" {name:>12}", end="")
print(f" {'Best':>12}")
print("-" * (25 + 14 * (n_models + 1)))
for cls in LABEL_NAMES:
print(f"{cls:<25}", end="")
cls_f1s = []
for r in all_results:
# Average per-class F1 across folds
fold_f1s = []
for fold in r["data"]["per_fold"]:
if cls in fold["per_class"]:
fold_f1s.append(fold["per_class"][cls]["f1"])
avg_f1 = np.mean(fold_f1s) if fold_f1s else 0
cls_f1s.append(avg_f1)
print(f" {avg_f1:>12.4f}", end="")
best_idx = np.argmax(cls_f1s)
best_name = all_results[best_idx]["name"].split("/")[-1][:12]
print(f" {best_name:>12}")
    # Heuristic soft-vote estimate: blend the average and the best per-class F1
    # across models, weighted toward the best model (see weighting below)
print(f"\n{'=' * 70}")
print("THEORETICAL ENSEMBLE PER-CLASS F1 (best-of-3 upper bound)")
print(f"{'=' * 70}")
ensemble_per_class = {}
for cls in LABEL_NAMES:
cls_f1s = []
for r in all_results:
fold_f1s = []
for fold in r["data"]["per_fold"]:
if cls in fold["per_class"]:
fold_f1s.append(fold["per_class"][cls]["f1"])
cls_f1s.append(np.mean(fold_f1s) if fold_f1s else 0)
# Soft-vote typically achieves between average and max of individual models
avg_f1 = np.mean(cls_f1s)
max_f1 = np.max(cls_f1s)
ensemble_est = avg_f1 * 0.3 + max_f1 * 0.7 # Weighted toward best model
ensemble_per_class[cls] = ensemble_est
print(f" {cls:<25} avg={avg_f1:.4f} max={max_f1:.4f} ensemble_est={ensemble_est:.4f}")
ens_macro_est = np.mean(list(ensemble_per_class.values()))
print(f"\n Estimated Ensemble Macro-F1: {ens_macro_est:.4f}")
    return {
        "models": [r["name"] for r in all_results],
        "metric_avg_micro": float(ens_micro_mean),
        "metric_avg_macro": float(ens_macro_mean),
        "estimated_macro": float(ens_macro_est),
        "per_class_estimate": {k: float(v) for k, v in ensemble_per_class.items()},
    }
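

# --- Sketches: threshold tuning and temperature scaling --------------------
# The module docstring mentions aggregated threshold tuning and temperature
# scaling; both need per-sample outputs (probabilities for tuning, logits and
# labels for calibration) that the saved CV results do not contain. The
# helpers below are illustrative sketches under that assumption: the grid
# ranges and the binary cross-entropy objective are choices made here, not
# taken from the training code.
def tune_thresholds_across_folds(
    probs: np.ndarray, y_true: np.ndarray, grid=None
) -> np.ndarray:
    """Pick one threshold per label by maximizing binary F1 on pooled folds."""
    if grid is None:
        grid = np.arange(0.05, 0.96, 0.05)
    best = np.full(y_true.shape[1], 0.5)
    for k in range(y_true.shape[1]):
        best_f1 = -1.0
        for t in grid:
            pred = (probs[:, k] >= t).astype(int)
            _, _, f1, _ = precision_recall_fscore_support(
                y_true[:, k], pred, average="binary", zero_division=0
            )
            if f1 > best_f1:
                best_f1, best[k] = f1, t
    return best


def fit_temperature(logits: np.ndarray, y_true: np.ndarray) -> float:
    """Grid-search a single temperature that minimizes binary cross-entropy."""

    def bce(t: float) -> float:
        p = 1.0 / (1.0 + np.exp(-logits / t))  # temperature-scaled sigmoid
        p = np.clip(p, 1e-7, 1.0 - 1e-7)
        return float(-np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p)))

    temps = np.arange(0.5, 5.01, 0.05)
    return float(temps[int(np.argmin([bce(t) for t in temps]))])
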
def main():
base_dir = Path(__file__).parent.parent
cv_dir = base_dir / "evaluation" / "cv_results"
# Find all CV result files
result_files = sorted(cv_dir.glob("cv_results_*_5fold.json"))
logger.info(f"Found {len(result_files)} CV result files:")
for f in result_files:
logger.info(f" {f.name}")
if len(result_files) < 2:
logger.error("Need at least 2 model CV results for ensemble. Run CV for each model first.")
return
# Run ensemble analysis
ensemble_result = ensemble_from_cv_results(result_files)
# Save
output_path = cv_dir / "ensemble_analysis.json"
with open(output_path, "w") as f:
json.dump(ensemble_result, f, indent=2, default=str)
logger.info(f"\nSaved to: {output_path}")
if __name__ == "__main__":
main()