| import numpy as np |
| import scipy.stats as stats |
| import matplotlib.pyplot as plt |
| import math |
| from sklearn.metrics import roc_curve, roc_auc_score |
| from typing import Tuple, Optional |
| from sklearn.preprocessing import label_binarize |
|
|
| |
| |
|
|
| |
| |
def compute_midrank(
    x: np.ndarray
) -> np.ndarray:
    """Compute 1-based midranks of x, averaging ranks across ties.

    Args:
        x: a 1D numpy array of scores.
    Returns:
        Array of midranks aligned with the original order of x.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    n = len(x)
    ranks = np.zeros(n, dtype=float)
    lo = 0
    while lo < n:
        # Advance hi past the run of values equal to sorted_x[lo].
        hi = lo
        while hi < n and sorted_x[hi] == sorted_x[lo]:
            hi += 1
        # Every member of the tie group gets the average 0-based rank.
        ranks[lo:hi] = 0.5 * (lo + hi - 1)
        lo = hi
    result = np.empty(n, dtype=float)
    # Shift to 1-based ranks and scatter back to the original positions.
    result[order] = ranks + 1
    return result
|
|
|
|
def compute_midrank_weight(
    x: np.ndarray,
    sample_weight: np.ndarray
) -> np.ndarray:
    """Compute weighted midranks of x, averaging cumulative weight across ties.

    Args:
        x: a 1D numpy array of scores.
        sample_weight: per-example weights, same length as x.
    Returns:
        Array of weighted midranks aligned with the original order of x.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    cumulative_weight = np.cumsum(sample_weight[order])
    n = len(x)
    ranks = np.zeros(n, dtype=float)
    lo = 0
    while lo < n:
        # Advance hi past the run of values equal to sorted_x[lo].
        hi = lo
        while hi < n and sorted_x[hi] == sorted_x[lo]:
            hi += 1
        # The tie group shares the mean cumulative weight over the run.
        ranks[lo:hi] = cumulative_weight[lo:hi].mean()
        lo = hi
    result = np.empty(n, dtype=float)
    # Scatter back to the original positions (no +1 shift, by design:
    # weighted ranks are expressed on the cumulative-weight scale).
    result[order] = ranks
    return result
|
|
|
|
def fastDeLong(
    predictions_sorted_transposed: np.ndarray,
    label_1_count: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
        predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
            sorted such that the examples with label "1" come first
        label_1_count: number of positive examples (the leading columns)
    Returns:
        (AUC values per classifier, DeLong covariance matrix)
    Reference:
        @article{sun2014fast,
          title={Fast Implementation of DeLong's Algorithm for
                 Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
          author={Xu Sun and Weichao Xu},
          journal={IEEE Signal Processing Letters},
          volume={21}, number={11}, pages={1389--1393},
          year={2014}, publisher={IEEE}
        }
    """
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    k = predictions_sorted_transposed.shape[0]
    positives = predictions_sorted_transposed[:, :m]
    negatives = predictions_sorted_transposed[:, m:]

    # Midranks within positives, within negatives, and over the full sample.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for clf in range(k):
        tx[clf, :] = compute_midrank(positives[clf, :])
        ty[clf, :] = compute_midrank(negatives[clf, :])
        tz[clf, :] = compute_midrank(predictions_sorted_transposed[clf, :])

    # Mann-Whitney form of the AUC from the positive-example global ranks.
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    # Structural components V01 / V10 of DeLong's estimator.
    v01 = (tz[:, :m] - tx) / n
    v10 = 1.0 - (tz[:, m:] - ty) / m
    delongcov = np.cov(v01) / m + np.cov(v10) / n
    return aucs, delongcov
|
|
|
|
def calc_pvalue(
    aucs: np.ndarray,
    sigma: np.ndarray
) -> float:
    """Compute log10 of the two-sided p-value for AUC difference.

    Args:
        aucs: 1D array of exactly two AUCs.
        sigma: 2x2 DeLong covariance matrix of the AUCs.
    Returns:
        log10(p-value) of the z-test on the AUC difference.
    """
    # Contrast vector selecting the difference of the two AUCs.
    contrast = np.array([[1, -1]])
    variance = np.dot(np.dot(contrast, sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(variance)
    # Two-sided: p = 2 * sf(z); work in log space for numerical stability.
    log10_sf = stats.norm.logsf(z, loc=0, scale=1).item() / np.log(10)
    return float(np.log10(2) + log10_sf)
|
|
|
|
|
|
def compute_ground_truth_statistics(
    ground_truth: np.ndarray,
    sample_weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int, Optional[np.ndarray]]:
    """Derive the ordering that puts positive examples first.

    Args:
        ground_truth: array containing only 0s and 1s (both present).
        sample_weight: optional per-example weights.
    Returns:
        (order placing label-1 examples first, count of label-1 examples,
         sample weights permuted by that order, or None)
    """
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    # Sorting the negated labels puts the 1s in front.
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    ordered_sample_weight = (
        None if sample_weight is None else sample_weight[order]
    )
    return order, label_1_count, ordered_sample_weight
|
|
|
|
def delong_roc_variance(
    ground_truth: np.ndarray,
    predictions: np.ndarray
) -> Tuple[float, np.ndarray]:
    """
    Compute the ROC AUC and its DeLong variance for a single classifier.
    Args:
        ground_truth: np.array of 0 and 1
        predictions: np.array of floats of the probability of being class 1
    Returns:
        (AUC, DeLong covariance of the AUC estimate)
    """
    # Unweighted case: no sample weights are used here.
    order, label_1_count, _ = compute_ground_truth_statistics(
        ground_truth, None)
    # Shape (1, n_examples): fastDeLong expects a classifier axis.
    sorted_scores = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(sorted_scores, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov
|
|
|
|
def delong_roc_test(
    ground_truth: np.ndarray,
    predictions_one: np.ndarray,
    predictions_two: np.ndarray
) -> float:
    """
    Computes log10(p-value) for the hypothesis that two ROC AUCs are different.

    Args:
        ground_truth: np.array of 0 and 1
        predictions_one: predictions of the first model,
            np.array of floats of the probability of being class 1
        predictions_two: predictions of the second model,
            np.array of floats of the probability of being class 1
    Returns:
        log10 of the two-sided DeLong test p-value (see calc_pvalue).
    """
    # Note: the previous version assigned an unused `sample_weight = None`
    # local; this test is unweighted by construction.
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth)
    # Stack both models on the classifier axis and apply the shared ordering.
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)
|
|
|
|
def roc_auc_ci_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 0.95) -> Tuple[float, np.ndarray]:
    """Compute the AUC and a DeLong-based (alpha)-level confidence interval.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_pred: np.array of class-1 probability scores.
        alpha: confidence level of the interval (default 0.95).
    Returns:
        (AUC, np.array([lower, upper]) clipped to [0, 1])
    """
    auc, auc_cov = delong_roc_variance(y_true, y_pred)
    auc_std = np.sqrt(auc_cov)

    if auc_std < 1e-10:
        # Degenerate (near-zero) variance: collapse to a point interval.
        # (Covers auc == 1.0 and auc == 0.0 as well.)
        ci = np.array([auc, auc])
    else:
        # Two-sided quantiles, e.g. [0.025, 0.975] for alpha = 0.95.
        tail = (1 - alpha) / 2
        quantiles = np.abs(np.array([0, 1]) - tail)
        ci = stats.norm.ppf(quantiles, loc=auc, scale=auc_std)

    # The normal approximation can step outside the valid AUC range.
    ci[ci > 1] = 1
    ci[ci < 0] = 0
    return auc, ci
|
|
|
|
def bootstrap_auc_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[float, np.ndarray]:
    """Empirical (bootstrap) AUC estimate with a 95% percentile interval.

    The interval may differ slightly between runs with different seeds
    because it is driven by random resampling.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_score: np.array of class-1 probability scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (mean bootstrap AUC, np.array([2.5th, 97.5th]) percentiles)
    Raises:
        ValueError: if no resample contains both classes, so no AUC
            could be computed.
    """
    rng = np.random.RandomState(seed)
    aucs = []

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # A resample with a single class has no defined AUC; skip it.
        if len(np.unique(y_true[indices])) < 2:
            continue
        aucs.append(roc_auc_score(y_true[indices], y_score[indices]))

    # BUG FIX: previously this printed an explanatory message on every call
    # and crashed with an obscure numpy error when `aucs` stayed empty.
    if not aucs:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate a bootstrap AUC confidence interval."
        )

    aucs = np.array(aucs)
    return np.mean(aucs), np.percentile(aucs, [2.5, 97.5])
| |
|
|
def bootstrap_roc_curve_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Bootstrap a pointwise 95% confidence band around the ROC curve.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_score: np.array of class-1 probability scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (fixed FPR grid of 100 points, mean TPR, 2.5th-percentile TPR,
         97.5th-percentile TPR), each of shape (100,).
    """
    rng = np.random.RandomState(seed)
    fpr_grid = np.linspace(0, 1, 100)
    interpolated_tprs = []

    for _ in range(n_bootstraps):
        idx = rng.randint(0, len(y_true), len(y_true))
        labels_boot = y_true[idx]
        # Single-class resamples have no ROC curve; skip them.
        if len(np.unique(labels_boot)) < 2:
            continue
        scores_boot = y_score[idx]

        fpr_boot, tpr_boot, _ = roc_curve(labels_boot, scores_boot)
        # Resample each bootstrap curve onto the common FPR grid.
        tpr_on_grid = np.interp(fpr_grid, fpr_boot, tpr_boot)
        tpr_on_grid[0] = 0.0  # every ROC curve starts at the origin
        interpolated_tprs.append(tpr_on_grid)

    stacked = np.array(interpolated_tprs)
    return (
        fpr_grid,
        np.mean(stacked, axis=0),
        np.percentile(stacked, 2.5, axis=0),
        np.percentile(stacked, 97.5, axis=0),
    )
|
|
|
|
| def _prepare_targets_scores( |
| y_true: np.ndarray, |
| y_score: np.ndarray |
| ): |
| """ |
| Detect task type & return (Y_onehot, Y_score_2D, n_classes, task_name) |
| Works for binary, multiclass and multilabel. For binary we make sure |
| to return TWO columns (neg / pos) so that the downstream loop over |
| classes [0, 1] is always valid. |
| """ |
| |
| if y_true.ndim == 1: |
| n_classes = int(np.max(y_true)) + 1 |
| if n_classes == 2: |
| task_name = "binary" |
|
|
| |
| y_true_1hot = np.column_stack([1 - y_true, y_true]) |
|
|
| |
| if y_score.ndim == 1: |
| y_score_2d = np.column_stack([1 - y_score, y_score]) |
| else: |
| if y_score.shape[1] == 1: |
| y_score_2d = np.column_stack([1 - y_score[:, 0], y_score[:, 0]]) |
| else: |
| y_score_2d = y_score |
|
|
| else: |
| task_name = "multiclass" |
| y_true_1hot = label_binarize(y_true, classes=list(range(n_classes))) |
| y_score_2d = y_score |
|
|
| |
| else: |
| task_name = "multilabel" |
| n_classes = y_true.shape[1] |
| y_true_1hot = y_true.astype(int) |
| y_score_2d = y_score |
|
|
| return y_true_1hot, y_score_2d, n_classes, task_name |
|
|
|
|
|
|
def plot_roc_with_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    save_path: Optional[str] = None,
    fig_title: Optional[str] = None,
    n_bootstraps: int = 1000,
    seed: int = 42,
) -> None:
    """
    Draw ROC curves (with 95 % CI) for binary / multiclass / multilabel setups
    on one canvas with tidy sub-plots.

    Parameters
    ----------
    y_true : array-like
        * binary / multiclass : shape (N,)
        * multilabel : shape (N, C)
    y_score : array-like
        probability scores – same shape as y_true except for binary
        where shape can be (N,) or (N, 2) (class-1 prob in column 1)
    save_path : str | None
        if given, the figure is stored as PNG.
    fig_title : str | None
        custom super-title. Defaults to the positive-class AUC summary for
        binary tasks and to "ROC curves" otherwise.
    n_bootstraps : int
        number of bootstrap resamples for the CI band.
    seed : int
        RNG seed for the bootstrap.
    """
    Y, S, C, task = _prepare_targets_scores(y_true, y_score)

    # Near-square grid of sub-plots, one per class.
    n_rows = math.ceil(math.sqrt(C))
    n_cols = math.ceil(C / n_rows)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4.5 * n_cols, 4.5 * n_rows), dpi=200,
        squeeze=False
    )

    # Track the last class's statistics: for the binary task the last class
    # (cls == 1) is the positive class, whose AUC summarizes the whole figure.
    last_auc, last_ci = None, None
    for cls in range(C):
        y_true_cls = Y[:, cls]
        y_score_cls = S[:, cls]

        # Bootstrap CI band for the curve + DeLong CI for the AUC.
        fpr, tpr_mean, tpr_low, tpr_up = bootstrap_roc_curve_ci(
            y_true_cls, y_score_cls,
            n_bootstraps=n_bootstraps, seed=seed
        )
        auc, ci = roc_auc_ci_score(y_true_cls, y_score_cls)
        ci = ci.tolist()
        last_auc, last_ci = auc, ci
        r, c = divmod(cls, n_cols)
        ax = axes[r][c]

        ax.plot(fpr, tpr_mean, lw=1.5, label=f"AUC = {auc:.3f}, CI = {ci[0]:.3f} - {ci[1]:.3f}")
        ax.fill_between(fpr, tpr_low, tpr_up, alpha=.25, label="95 % CI")
        ax.plot([0, 1], [0, 1], "k--", lw=.8)  # chance diagonal

        ax.set_title(f"Class {cls}")
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.grid(ls="--", alpha=.4)
        ax.legend(fontsize=8, loc="lower right")

        # De-clutter: hide the top/right spines.
        for side in ["top", "right"]:
            ax.spines[side].set_visible(False)

    # Drop grid cells beyond the last class.
    for extra in range(C, n_rows * n_cols):
        r, c = divmod(extra, n_cols)
        fig.delaxes(axes[r][c])

    if fig_title:
        title = fig_title
    elif task == "binary":
        # Binary: the positive class (plotted last) summarizes the figure.
        title = f"ROC Curve (AUC = {last_auc:.3f}, 95% CI = {last_ci[0]:.3f} - {last_ci[1]:.3f})"
    else:
        # BUG FIX: the old default reported only the LAST class's AUC/CI in
        # the suptitle, which is misleading for multiclass/multilabel panels.
        title = "ROC curves"
    fig.suptitle(title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])

    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved ROC panel ➜ {save_path}")
    else:
        plt.show()
|
|
|
|
|
|
|
|