| import os |
| import json |
| import argparse |
| import numpy as np |
| import pandas as pd |
| import xgboost as xgb |
| from scipy import stats |
| from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve |
| from datasets import load_from_disk, DatasetDict |
|
|
def best_f1_threshold(y_true, y_prob):
    """Return (threshold, f1) at the F1-maximizing point of the PR curve.

    sklearn's final (precision, recall) point has no associated threshold,
    so the curve is truncated by one before computing F1.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    precision, recall = precision[:-1], recall[:-1]
    # Epsilon avoids 0/0 when precision == recall == 0 at some threshold.
    f1_curve = (2.0 * precision * recall) / (precision + recall + 1e-12)
    best = int(np.nanargmax(f1_curve))
    return float(thresholds[best]), float(f1_curve[best])
|
|
|
|
def bootstrap_ci(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    n_bootstrap: int = 2000,
    ci: float = 0.95,
    seed: int = 1986,
) -> dict:
    """
    Non-parametric bootstrap CI for F1 (at val-optimal threshold) and AUC.

    Resamples (y_true, y_prob) pairs with replacement. Resamples that end
    up single-class are skipped, since AUC (and a meaningful F1) are
    undefined there.

    Parameters
    ----------
    y_true : array of {0, 1} labels.
    y_prob : array of predicted positive-class probabilities.
    n_bootstrap : number of bootstrap resamples to draw.
    ci : central confidence level (0.95 -> [2.5%, 97.5%] quantiles).
    seed : RNG seed for reproducibility.

    Returns
    -------
    dict with "f1" and "auc" sub-dicts (mean/std/ci_low/ci_high/report/
    n_bootstrap) plus "threshold_used" and "n_samples".

    Raises
    ------
    ValueError
        If every resample was single-class (e.g. y_true is degenerate);
        previously this fell through to NaN means and an opaque
        IndexError from np.quantile on an empty array.
    """
    rng = np.random.default_rng(seed=seed)
    n = len(y_true)

    # NOTE(review): the threshold is tuned on the same data being
    # bootstrapped, so the F1 CI is slightly optimistic -- acceptable
    # for a validation-set report, but worth keeping in mind.
    thr, _ = best_f1_threshold(y_true, y_prob)

    f1_scores, auc_scores = [], []

    for _ in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)
        yt, yp = y_true[idx], y_prob[idx]

        # Both classes must be present for AUC to be defined.
        if len(np.unique(yt)) < 2:
            continue

        f1_scores.append(f1_score(yt, (yp >= thr).astype(int), zero_division=0))
        auc_scores.append(roc_auc_score(yt, yp))

    if not f1_scores:
        raise ValueError(
            "All bootstrap resamples were single-class; cannot compute CIs "
            "(check that y_true contains both classes)."
        )

    alpha = 1 - ci
    lo, hi = alpha / 2, 1 - alpha / 2

    results = {}
    for name, arr in [("f1", f1_scores), ("auc", auc_scores)]:
        arr = np.array(arr)
        results[name] = {
            "mean": float(arr.mean()),
            "std": float(arr.std()),
            "ci_low": float(np.quantile(arr, lo)),
            "ci_high": float(np.quantile(arr, hi)),
            "report": f"{arr.mean():.4f} [{np.quantile(arr, lo):.4f}, {np.quantile(arr, hi):.4f}]",
            "n_bootstrap": len(arr),
        }

    results["threshold_used"] = float(thr)
    results["n_samples"] = int(n)
    return results
|
|
def prob_margin_uncertainty(val_preds_df: pd.DataFrame) -> pd.DataFrame:
    """
    Score each row's uncertainty as distance from the 0.5 decision boundary.

    confidence = 2 * |y_prob - 0.5|, normalized to [0, 1];
    uncertainty = 1 - confidence.

    A probability of exactly 0.5 is a coin flip (uncertainty 1.0), while
    0.0 or 1.0 is maximally confident (uncertainty 0.0). The input frame
    is not modified; a copy with the two new columns is returned.
    """
    out = val_preds_df.copy()
    margin = (out["y_prob"] - 0.5).abs()
    out["uncertainty"] = 1 - 2 * margin
    out["confidence"] = 2 * margin
    return out
|
|
def save_ci_report(ci_results: dict, out_dir: str, model_name: str = ""):
    """Persist bootstrap CI results as JSON under out_dir and print a summary.

    Creates out_dir if needed; writes to <out_dir>/bootstrap_ci.json.
    """
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "bootstrap_ci.json")
    with open(path, "w") as fh:
        json.dump(ci_results, fh, indent=2)

    # Human-readable console summary of the saved report.
    print(f"\n=== Bootstrap 95% CI ({model_name}) ===")
    print(f" F1 : {ci_results['f1']['report']}")
    print(f" AUC : {ci_results['auc']['report']}")
    print(
        f"(threshold={ci_results['threshold_used']:.4f}, "
        f"n_bootstrap={ci_results['f1']['n_bootstrap']}, "
        f"n_val={ci_results['n_samples']})"
    )
    print(f"Saved to {path}")
|
|
|
|
def save_uncertainty_csv(df: pd.DataFrame, out_dir: str, fname: str = "val_uncertainty.csv"):
    """Write the per-row uncertainty frame to <out_dir>/<fname> and print stats.

    Expects 'uncertainty' and 'confidence' columns; creates out_dir if needed.
    """
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, fname)
    df.to_csv(path, index=False)

    mean_unc = df["uncertainty"].mean()
    mean_conf = df["confidence"].mean()
    print("\n=== Per-molecule uncertainty ===")
    print(f" Mean uncertainty : {mean_unc:.4f}")
    print(f" Mean confidence : {mean_conf:.4f}")
    print(f" Saved to {path}")
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["ci", "uncertainty_xgb", "uncertainty_prob"],
                        required=True,
                        help=(
                            "ci : bootstrap CI from val_predictions.csv (all models)\n"
                            "uncertainty_prob : margin uncertainty for SVM/ElasticNet/XGB"
                        ))
    parser.add_argument("--val_preds", type=str, help="Path to val_predictions.csv")
    parser.add_argument("--model_path", type=str, help="Path to best_model.json (XGB only)")
    parser.add_argument("--dataset_path", type=str, help="HuggingFace dataset path (XGB uncertainty only)")
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--model_name", type=str, default="", help="Label for report (xgb_smiles)")
    parser.add_argument("--n_bootstrap", type=int, default=2000)
    args = parser.parse_args()

    if args.mode == "ci":
        # parser.error (not assert) so validation survives `python -O`
        # and produces a proper usage message + exit code 2.
        if not args.val_preds:
            parser.error("--val_preds required for ci mode")
        df = pd.read_csv(args.val_preds)
        ci = bootstrap_ci(df["y_true"].values, df["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
    elif args.mode == "uncertainty_prob":
        if not args.val_preds:
            parser.error("--val_preds required for uncertainty_prob")
        df_preds = pd.read_csv(args.val_preds)

        # CI report on the same predictions...
        ci = bootstrap_ci(df_preds["y_true"].values, df_preds["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)

        # ...plus per-row margin uncertainty.
        df_unc = prob_margin_uncertainty(df_preds)
        save_uncertainty_csv(df_unc, args.out_dir, "val_uncertainty_prob.csv")
    else:  # "uncertainty_xgb"
        # Previously this mode was accepted by `choices` but had no branch,
        # so the script exited silently having done nothing. Fail loudly
        # instead until the XGB-specific path is implemented.
        parser.error("mode 'uncertainty_xgb' is not implemented in this script")