| import os |
| import json |
| import argparse |
| import numpy as np |
| import pandas as pd |
| import xgboost as xgb |
| from scipy import stats |
| from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve |
| from datasets import load_from_disk, DatasetDict |
|
|
def best_f1_threshold(y_true, y_prob):
    """Return (threshold, f1) at the F1-maximizing point of the PR curve.

    sklearn's final (precision, recall) point has no associated threshold,
    so the curve is truncated by one before computing F1.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    precision, recall = precision[:-1], recall[:-1]
    # Epsilon avoids 0/0 when precision == recall == 0 at some threshold.
    f1_curve = (2.0 * precision * recall) / (precision + recall + 1e-12)
    best = int(np.nanargmax(f1_curve))
    return float(thresholds[best]), float(f1_curve[best])
|
|
|
|
def bootstrap_ci(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    n_bootstrap: int = 2000,
    ci: float = 0.95,
    seed: int = 1986,
) -> dict:
    """
    Non-parametric bootstrap CI for F1 (at val-optimal threshold) and AUC.

    Resamples (y_true, y_prob) pairs with replacement. Resamples that end
    up single-class are skipped, since AUC (and a meaningful F1) are
    undefined there.

    Parameters
    ----------
    y_true : array of {0, 1} labels.
    y_prob : array of predicted positive-class probabilities.
    n_bootstrap : number of bootstrap resamples to draw.
    ci : central confidence level (0.95 -> [2.5%, 97.5%] quantiles).
    seed : RNG seed for reproducibility.

    Returns
    -------
    dict with "f1" and "auc" sub-dicts (mean/std/ci_low/ci_high/report/
    n_bootstrap) plus "threshold_used" and "n_samples".

    Raises
    ------
    ValueError
        If every resample was single-class (e.g. y_true is degenerate);
        previously this fell through to NaN means and an opaque
        IndexError from np.quantile on an empty array.
    """
    rng = np.random.default_rng(seed=seed)
    n = len(y_true)

    # NOTE(review): the threshold is tuned on the same data being
    # bootstrapped, so the F1 CI is slightly optimistic -- acceptable
    # for a validation-set report, but worth keeping in mind.
    thr, _ = best_f1_threshold(y_true, y_prob)

    f1_scores, auc_scores = [], []

    for _ in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)
        yt, yp = y_true[idx], y_prob[idx]

        # Both classes must be present for AUC to be defined.
        if len(np.unique(yt)) < 2:
            continue

        f1_scores.append(f1_score(yt, (yp >= thr).astype(int), zero_division=0))
        auc_scores.append(roc_auc_score(yt, yp))

    if not f1_scores:
        raise ValueError(
            "All bootstrap resamples were single-class; cannot compute CIs "
            "(check that y_true contains both classes)."
        )

    alpha = 1 - ci
    lo, hi = alpha / 2, 1 - alpha / 2

    results = {}
    for name, arr in [("f1", f1_scores), ("auc", auc_scores)]:
        arr = np.array(arr)
        results[name] = {
            "mean": float(arr.mean()),
            "std": float(arr.std()),
            "ci_low": float(np.quantile(arr, lo)),
            "ci_high": float(np.quantile(arr, hi)),
            "report": f"{arr.mean():.4f} [{np.quantile(arr, lo):.4f}, {np.quantile(arr, hi):.4f}]",
            "n_bootstrap": len(arr),
        }

    results["threshold_used"] = float(thr)
    results["n_samples"] = int(n)
    return results
|
|
def prob_margin_uncertainty(val_preds_df: pd.DataFrame) -> pd.DataFrame:
    """
    Score each row's uncertainty as distance from the 0.5 decision boundary.

    confidence = 2 * |y_prob - 0.5|, normalized to [0, 1];
    uncertainty = 1 - confidence.

    A probability of exactly 0.5 is a coin flip (uncertainty 1.0), while
    0.0 or 1.0 is maximally confident (uncertainty 0.0). The input frame
    is not modified; a copy with the two new columns is returned.
    """
    out = val_preds_df.copy()
    margin = (out["y_prob"] - 0.5).abs()
    out["uncertainty"] = 1 - 2 * margin
    out["confidence"] = 2 * margin
    return out
|
|
def save_ci_report(ci_results: dict, out_dir: str, model_name: str = ""):
    """Persist bootstrap CI results as JSON under out_dir and print a summary.

    Creates out_dir if needed; writes to <out_dir>/bootstrap_ci.json.
    """
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "bootstrap_ci.json")
    with open(path, "w") as fh:
        json.dump(ci_results, fh, indent=2)

    # Human-readable console summary of the saved report.
    print(f"\n=== Bootstrap 95% CI ({model_name}) ===")
    print(f" F1 : {ci_results['f1']['report']}")
    print(f" AUC : {ci_results['auc']['report']}")
    print(
        f"(threshold={ci_results['threshold_used']:.4f}, "
        f"n_bootstrap={ci_results['f1']['n_bootstrap']}, "
        f"n_val={ci_results['n_samples']})"
    )
    print(f"Saved to {path}")
|
|
|
|
def save_uncertainty_csv(df: pd.DataFrame, out_dir: str, fname: str = "val_uncertainty.csv"):
    """Write the per-row uncertainty frame to <out_dir>/<fname> and print stats.

    Expects 'uncertainty' and 'confidence' columns; creates out_dir if needed.
    """
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, fname)
    df.to_csv(path, index=False)

    mean_unc = df["uncertainty"].mean()
    mean_conf = df["confidence"].mean()
    print("\n=== Per-molecule uncertainty ===")
    print(f" Mean uncertainty : {mean_unc:.4f}")
    print(f" Mean confidence : {mean_conf:.4f}")
    print(f" Saved to {path}")
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["ci", "uncertainty_xgb", "uncertainty_prob"],
                        required=True,
                        help=(
                            "ci : bootstrap CI from val_predictions.csv (all models)\n"
                            "uncertainty_prob : margin uncertainty for SVM/ElasticNet/XGB"
                        ))
    parser.add_argument("--val_preds", type=str, help="Path to val_predictions.csv")
    parser.add_argument("--model_path", type=str, help="Path to best_model.json (XGB only)")
    parser.add_argument("--dataset_path", type=str, help="HuggingFace dataset path (XGB uncertainty only)")
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--model_name", type=str, default="", help="Label for report (xgb_smiles)")
    parser.add_argument("--n_bootstrap", type=int, default=2000)
    args = parser.parse_args()

    if args.mode == "ci":
        # parser.error (not assert) so validation survives `python -O`
        # and produces a proper usage message + exit code 2.
        if not args.val_preds:
            parser.error("--val_preds required for ci mode")
        df = pd.read_csv(args.val_preds)
        ci = bootstrap_ci(df["y_true"].values, df["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
    elif args.mode == "uncertainty_prob":
        if not args.val_preds:
            parser.error("--val_preds required for uncertainty_prob")
        df_preds = pd.read_csv(args.val_preds)

        # CI report on the same predictions...
        ci = bootstrap_ci(df_preds["y_true"].values, df_preds["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)

        # ...plus per-row margin uncertainty.
        df_unc = prob_margin_uncertainty(df_preds)
        save_uncertainty_csv(df_unc, args.out_dir, "val_uncertainty_prob.csv")
    else:  # "uncertainty_xgb"
        # Previously this mode was accepted by `choices` but had no branch,
        # so the script exited silently having done nothing. Fail loudly
        # instead until the XGB-specific path is implemented.
        parser.error("mode 'uncertainty_xgb' is not implemented in this script")