| import numpy as np |
| import scipy.stats as stats |
| import matplotlib.pyplot as plt |
| import math |
| from sklearn.metrics import roc_curve, roc_auc_score |
| from typing import Tuple, Optional |
| from sklearn.preprocessing import label_binarize |
|
|
| |
| |
|
|
| |
| |
def compute_midrank(
    x: np.ndarray
) -> np.ndarray:
    """Compute 1-based midranks of x, averaging ranks across ties.

    Args:
        x: a 1D numpy array of scores.
    Returns:
        Array of midranks aligned with the original order of x.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    n = len(x)
    ranks = np.zeros(n, dtype=float)
    lo = 0
    while lo < n:
        # Advance hi past the run of values equal to sorted_x[lo].
        hi = lo
        while hi < n and sorted_x[hi] == sorted_x[lo]:
            hi += 1
        # Every member of the tie group gets the average 0-based rank.
        ranks[lo:hi] = 0.5 * (lo + hi - 1)
        lo = hi
    result = np.empty(n, dtype=float)
    # Shift to 1-based ranks and scatter back to the original positions.
    result[order] = ranks + 1
    return result
|
|
|
|
def compute_midrank_weight(
    x: np.ndarray,
    sample_weight: np.ndarray
) -> np.ndarray:
    """Compute weighted midranks of x, averaging cumulative weight across ties.

    Args:
        x: a 1D numpy array of scores.
        sample_weight: per-example weights, same length as x.
    Returns:
        Array of weighted midranks aligned with the original order of x.
    """
    order = np.argsort(x)
    sorted_x = x[order]
    cumulative_weight = np.cumsum(sample_weight[order])
    n = len(x)
    ranks = np.zeros(n, dtype=float)
    lo = 0
    while lo < n:
        # Advance hi past the run of values equal to sorted_x[lo].
        hi = lo
        while hi < n and sorted_x[hi] == sorted_x[lo]:
            hi += 1
        # The tie group shares the mean cumulative weight over the run.
        ranks[lo:hi] = cumulative_weight[lo:hi].mean()
        lo = hi
    result = np.empty(n, dtype=float)
    # Scatter back to the original positions (no +1 shift, by design:
    # weighted ranks are expressed on the cumulative-weight scale).
    result[order] = ranks
    return result
|
|
|
|
def fastDeLong(
    predictions_sorted_transposed: np.ndarray,
    label_1_count: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
        predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
            sorted such that the examples with label "1" come first
        label_1_count: number of positive examples (the leading columns)
    Returns:
        (AUC values per classifier, DeLong covariance matrix)
    Reference:
        @article{sun2014fast,
          title={Fast Implementation of DeLong's Algorithm for
                 Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
          author={Xu Sun and Weichao Xu},
          journal={IEEE Signal Processing Letters},
          volume={21}, number={11}, pages={1389--1393},
          year={2014}, publisher={IEEE}
        }
    """
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    k = predictions_sorted_transposed.shape[0]
    positives = predictions_sorted_transposed[:, :m]
    negatives = predictions_sorted_transposed[:, m:]

    # Midranks within positives, within negatives, and over the full sample.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for clf in range(k):
        tx[clf, :] = compute_midrank(positives[clf, :])
        ty[clf, :] = compute_midrank(negatives[clf, :])
        tz[clf, :] = compute_midrank(predictions_sorted_transposed[clf, :])

    # Mann-Whitney form of the AUC from the positive-example global ranks.
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    # Structural components V01 / V10 of DeLong's estimator.
    v01 = (tz[:, :m] - tx) / n
    v10 = 1.0 - (tz[:, m:] - ty) / m
    delongcov = np.cov(v01) / m + np.cov(v10) / n
    return aucs, delongcov
|
|
|
|
def calc_pvalue(
    aucs: np.ndarray,
    sigma: np.ndarray
) -> float:
    """Compute log10 of the two-sided p-value for AUC difference.

    Args:
        aucs: 1D array of exactly two AUCs.
        sigma: 2x2 DeLong covariance matrix of the AUCs.
    Returns:
        log10(p-value) of the z-test on the AUC difference.
    """
    # Contrast vector selecting the difference of the two AUCs.
    contrast = np.array([[1, -1]])
    variance = np.dot(np.dot(contrast, sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(variance)
    # Two-sided: p = 2 * sf(z); work in log space for numerical stability.
    log10_sf = stats.norm.logsf(z, loc=0, scale=1).item() / np.log(10)
    return float(np.log10(2) + log10_sf)
|
|
|
|
|
|
def compute_ground_truth_statistics(
    ground_truth: np.ndarray,
    sample_weight: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, int, Optional[np.ndarray]]:
    """Derive the ordering that puts positive examples first.

    Args:
        ground_truth: array containing only 0s and 1s (both present).
        sample_weight: optional per-example weights.
    Returns:
        (order placing label-1 examples first, count of label-1 examples,
         sample weights permuted by that order, or None)
    """
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    # Sorting the negated labels puts the 1s in front.
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    ordered_sample_weight = (
        None if sample_weight is None else sample_weight[order]
    )
    return order, label_1_count, ordered_sample_weight
|
|
|
|
def delong_roc_variance(
    ground_truth: np.ndarray,
    predictions: np.ndarray
) -> Tuple[float, np.ndarray]:
    """
    Compute the ROC AUC and its DeLong variance for a single classifier.
    Args:
        ground_truth: np.array of 0 and 1
        predictions: np.array of floats of the probability of being class 1
    Returns:
        (AUC, DeLong covariance of the AUC estimate)
    """
    # Unweighted case: no sample weights are used here.
    order, label_1_count, _ = compute_ground_truth_statistics(
        ground_truth, None)
    # Shape (1, n_examples): fastDeLong expects a classifier axis.
    sorted_scores = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(sorted_scores, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov
|
|
|
|
def delong_roc_test(
    ground_truth: np.ndarray,
    predictions_one: np.ndarray,
    predictions_two: np.ndarray
) -> float:
    """
    Computes log10(p-value) for the hypothesis that two ROC AUCs are different.

    Args:
        ground_truth: np.array of 0 and 1
        predictions_one: predictions of the first model,
            np.array of floats of the probability of being class 1
        predictions_two: predictions of the second model,
            np.array of floats of the probability of being class 1
    Returns:
        log10 of the two-sided DeLong test p-value (see calc_pvalue).
    """
    # Note: the previous version assigned an unused `sample_weight = None`
    # local; this test is unweighted by construction.
    order, label_1_count, _ = compute_ground_truth_statistics(ground_truth)
    # Stack both models on the classifier axis and apply the shared ordering.
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)
|
|
|
|
def roc_auc_ci_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 0.95) -> Tuple[float, np.ndarray]:
    """Compute the AUC and a DeLong-based (alpha)-level confidence interval.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_pred: np.array of class-1 probability scores.
        alpha: confidence level of the interval (default 0.95).
    Returns:
        (AUC, np.array([lower, upper]) clipped to [0, 1])
    """
    auc, auc_cov = delong_roc_variance(y_true, y_pred)
    auc_std = np.sqrt(auc_cov)

    if auc_std < 1e-10:
        # Degenerate (near-zero) variance: collapse to a point interval.
        # (Covers auc == 1.0 and auc == 0.0 as well.)
        ci = np.array([auc, auc])
    else:
        # Two-sided quantiles, e.g. [0.025, 0.975] for alpha = 0.95.
        tail = (1 - alpha) / 2
        quantiles = np.abs(np.array([0, 1]) - tail)
        ci = stats.norm.ppf(quantiles, loc=auc, scale=auc_std)

    # The normal approximation can step outside the valid AUC range.
    ci[ci > 1] = 1
    ci[ci < 0] = 0
    return auc, ci
|
|
|
|
def bootstrap_auc_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[float, np.ndarray]:
    """Empirical (bootstrap) AUC estimate with a 95% percentile interval.

    The interval may differ slightly between runs with different seeds
    because it is driven by random resampling.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_score: np.array of class-1 probability scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (mean bootstrap AUC, np.array([2.5th, 97.5th]) percentiles)
    Raises:
        ValueError: if no resample contains both classes, so no AUC
            could be computed.
    """
    rng = np.random.RandomState(seed)
    aucs = []

    for _ in range(n_bootstraps):
        indices = rng.randint(0, len(y_true), len(y_true))
        # A resample with a single class has no defined AUC; skip it.
        if len(np.unique(y_true[indices])) < 2:
            continue
        aucs.append(roc_auc_score(y_true[indices], y_score[indices]))

    # BUG FIX: previously this printed an explanatory message on every call
    # and crashed with an obscure numpy error when `aucs` stayed empty.
    if not aucs:
        raise ValueError(
            "No bootstrap resample contained both classes; "
            "cannot estimate a bootstrap AUC confidence interval."
        )

    aucs = np.array(aucs)
    return np.mean(aucs), np.percentile(aucs, [2.5, 97.5])
| |
|
|
def bootstrap_roc_curve_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bootstraps: int = 1000,
    seed: int = 42
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Bootstrap a pointwise 95% confidence band around the ROC curve.

    Args:
        y_true: np.array of 0 and 1 labels.
        y_score: np.array of class-1 probability scores.
        n_bootstraps: number of bootstrap resamples.
        seed: RNG seed for reproducibility.
    Returns:
        (fixed FPR grid of 100 points, mean TPR, 2.5th-percentile TPR,
         97.5th-percentile TPR), each of shape (100,).
    """
    rng = np.random.RandomState(seed)
    fpr_grid = np.linspace(0, 1, 100)
    interpolated_tprs = []

    for _ in range(n_bootstraps):
        idx = rng.randint(0, len(y_true), len(y_true))
        labels_boot = y_true[idx]
        # Single-class resamples have no ROC curve; skip them.
        if len(np.unique(labels_boot)) < 2:
            continue
        scores_boot = y_score[idx]

        fpr_boot, tpr_boot, _ = roc_curve(labels_boot, scores_boot)
        # Resample each bootstrap curve onto the common FPR grid.
        tpr_on_grid = np.interp(fpr_grid, fpr_boot, tpr_boot)
        tpr_on_grid[0] = 0.0  # every ROC curve starts at the origin
        interpolated_tprs.append(tpr_on_grid)

    stacked = np.array(interpolated_tprs)
    return (
        fpr_grid,
        np.mean(stacked, axis=0),
        np.percentile(stacked, 2.5, axis=0),
        np.percentile(stacked, 97.5, axis=0),
    )
|
|
|
|
| def _prepare_targets_scores( |
| y_true: np.ndarray, |
| y_score: np.ndarray |
| ): |
| """ |
| Detect task type & return (Y_onehot, Y_score_2D, n_classes, task_name) |
| Works for binary, multiclass and multilabel. For binary we make sure |
| to return TWO columns (neg / pos) so that the downstream loop over |
| classes [0, 1] is always valid. |
| """ |
| |
| if y_true.ndim == 1: |
| n_classes = int(np.max(y_true)) + 1 |
| if n_classes == 2: |
| task_name = "binary" |
|
|
| |
| y_true_1hot = np.column_stack([1 - y_true, y_true]) |
|
|
| |
| if y_score.ndim == 1: |
| y_score_2d = np.column_stack([1 - y_score, y_score]) |
| else: |
| if y_score.shape[1] == 1: |
| y_score_2d = np.column_stack([1 - y_score[:, 0], y_score[:, 0]]) |
| else: |
| y_score_2d = y_score |
|
|
| else: |
| task_name = "multiclass" |
| y_true_1hot = label_binarize(y_true, classes=list(range(n_classes))) |
| y_score_2d = y_score |
|
|
| |
| else: |
| task_name = "multilabel" |
| n_classes = y_true.shape[1] |
| y_true_1hot = y_true.astype(int) |
| y_score_2d = y_score |
|
|
| return y_true_1hot, y_score_2d, n_classes, task_name |
|
|
|
|
|
|
def plot_roc_with_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    save_path: Optional[str] = None,
    fig_title: Optional[str] = None,
    n_bootstraps: int = 1000,
    seed: int = 42,
) -> None:
    """
    Draw ROC curves (with 95 % CI) for binary / multiclass / multilabel setups
    on one canvas with tidy sub-plots.

    Parameters
    ----------
    y_true : array-like
        * binary / multiclass : shape (N,)
        * multilabel : shape (N, C)
    y_score : array-like
        probability scores – same shape as y_true except for binary
        where shape can be (N,) or (N, 2) (class-1 prob in column 1)
    save_path : str | None
        if given, the figure is stored as PNG.
    fig_title : str | None
        custom super-title. Defaults to the positive-class AUC summary for
        binary tasks and to "ROC curves" otherwise.
    n_bootstraps : int
        number of bootstrap resamples for the CI band.
    seed : int
        RNG seed for the bootstrap.
    """
    Y, S, C, task = _prepare_targets_scores(y_true, y_score)

    # Near-square grid of sub-plots, one per class.
    n_rows = math.ceil(math.sqrt(C))
    n_cols = math.ceil(C / n_rows)
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4.5 * n_cols, 4.5 * n_rows), dpi=200,
        squeeze=False
    )

    # Track the last class's statistics: for the binary task the last class
    # (cls == 1) is the positive class, whose AUC summarizes the whole figure.
    last_auc, last_ci = None, None
    for cls in range(C):
        y_true_cls = Y[:, cls]
        y_score_cls = S[:, cls]

        # Bootstrap CI band for the curve + DeLong CI for the AUC.
        fpr, tpr_mean, tpr_low, tpr_up = bootstrap_roc_curve_ci(
            y_true_cls, y_score_cls,
            n_bootstraps=n_bootstraps, seed=seed
        )
        auc, ci = roc_auc_ci_score(y_true_cls, y_score_cls)
        ci = ci.tolist()
        last_auc, last_ci = auc, ci
        r, c = divmod(cls, n_cols)
        ax = axes[r][c]

        ax.plot(fpr, tpr_mean, lw=1.5, label=f"AUC = {auc:.3f}, CI = {ci[0]:.3f} - {ci[1]:.3f}")
        ax.fill_between(fpr, tpr_low, tpr_up, alpha=.25, label="95 % CI")
        ax.plot([0, 1], [0, 1], "k--", lw=.8)  # chance diagonal

        ax.set_title(f"Class {cls}")
        ax.set_xlabel("FPR")
        ax.set_ylabel("TPR")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.grid(ls="--", alpha=.4)
        ax.legend(fontsize=8, loc="lower right")

        # De-clutter: hide the top/right spines.
        for side in ["top", "right"]:
            ax.spines[side].set_visible(False)

    # Drop grid cells beyond the last class.
    for extra in range(C, n_rows * n_cols):
        r, c = divmod(extra, n_cols)
        fig.delaxes(axes[r][c])

    if fig_title:
        title = fig_title
    elif task == "binary":
        # Binary: the positive class (plotted last) summarizes the figure.
        title = f"ROC Curve (AUC = {last_auc:.3f}, 95% CI = {last_ci[0]:.3f} - {last_ci[1]:.3f})"
    else:
        # BUG FIX: the old default reported only the LAST class's AUC/CI in
        # the suptitle, which is misleading for multiclass/multilabel panels.
        title = "ROC curves"
    fig.suptitle(title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])

    if save_path:
        fig.savefig(save_path, dpi=300)
        print(f"Saved ROC panel ➜ {save_path}")
    else:
        plt.show()
|
|
|
|
|
|
|
|