"""
This testing script loads actual probabilistic predictions from a ResNet fine-tuned on CIFAR.
A number of logits/ground-truth pickles are available online (the link is missing from this copy)
[they seem to have moved from Git-LFS to SharePoint].
See the accompanying notes (link missing from this copy) to decode the [model_dataset] filenames.
As a bonus, one could consider temperature scaling and measuring again after calibration.
"""
import sys
import numpy as np
import scipy.stats as stats
from scipy.special import softmax
import pickle
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from ece import create_bins, discretize_into_bins, ECE
# Open file with pickled variables
def unpickle_probs(file, verbose=0, normalize=True):
    """Load validation/test predictions and labels from a pickled logits file.

    The pickle must contain a 2-tuple. Two layouts are handled:

    * ``((preds_val, y_val), (preds_test, y_test))`` -- already split; used as is.
    * ``(predictions, labels)`` -- one array pair, which is split here with
      ``train_test_split`` so that 5000 rows form the validation part
      (``random_state=15`` keeps the split reproducible).

    Args:
        file: Path of the pickle file to read.
        verbose: If truthy, print the shapes of the four resulting arrays.
        normalize: If true, apply a softmax over the last axis of both
            prediction arrays (i.e. treat the stored values as logits).

    Returns:
        ``((y_probs_val, y_val), (y_probs_test, y_test))`` where both label
        arrays have been flattened with ``ravel()``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file, "rb") as f:
        y1, y2 = pickle.load(f)

    if isinstance(y1, tuple):
        # The file already stores separate validation and test pairs.
        y_probs_val, y_val = y1
        y_probs_test, y_test = y2
    else:
        # Splits the data in the case of pretrained models: everything except
        # 5000 rows goes to the test part.
        y_probs_val, y_probs_test, y_val, y_test = train_test_split(
            y1, y2.reshape(-1, 1), test_size=len(y2) - 5000, random_state=15
        )

    if normalize:
        y_probs_val = softmax(y_probs_val, -1)
        y_probs_test = softmax(y_probs_test, -1)

    if verbose:
        print("y_probs_val:", y_probs_val.shape)  # e.g. (5000, 10)
        print("y_true_val:", y_val.shape)  # e.g. (5000, 1)
        print("y_probs_test:", y_probs_test.shape)  # e.g. (10000, 10)
        print("y_true_test:", y_test.shape)  # e.g. (10000, 1)

    return ((y_probs_val, y_val.ravel()), (y_probs_test, y_test.ravel()))
def unpickle_structured_probs(valpath=None, testpath=None):
    """Load structured (sequence-model) predictions and labels from two pickles.

    Each pickle must contain a 4-tuple whose first element holds the
    predictions and whose third element holds the labels; the second and
    fourth elements are ignored. The predictions are stored exponentiated, so
    the natural log is taken to recover the structured logits.

    Args:
        valpath: Path of the validation pickle. ``None`` falls back to the
            author's local CoNLL-2003 validation file.
        testpath: Path of the test pickle. ``None`` falls back to the
            author's local CoNLL-2003 test file.

    Returns:
        ``((X_val, y_val), (X_test, y_test))``, the same nesting that
        ``unpickle_probs`` returns. Labels are returned as loaded.
    """
    # The hard-coded paths are now defaults only; previously they silently
    # overwrote whatever the caller passed in.
    if valpath is None:
        valpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_validation_UTY.pickle"
    if testpath is None:
        testpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_test_UTY.pickle"

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(valpath, "rb") as f:
        X_val, _, y_val, _ = pickle.load(f)
    with open(testpath, "rb") as f:
        X_test, _, y_test, _ = pickle.load(f)

    X_val = np.log(X_val)  # originally exponentiated [different purposes]
    X_test = np.log(X_test)  # originally exponentiated [different purposes]

    # structured logits; returned so the loaded data is no longer discarded
    return (X_val, y_val), (X_test, y_test)
# ALTERNATE equal-mass binning
# Define data types (aliases used by the equal-mass binning helpers).
from typing import List, Tuple, NewType, TypeVar
Data = List[Tuple[float, float]]  # List of (predicted_probability, true_label).
Bins = List[float]  # List of bin boundaries, excluding 0.0, but including 1.0.
BinnedData = List[Data]  # binned_data[i] contains the data in bin i.
T = TypeVar('T')  # Generic element type, used in the signature of `split`.
eps = 1e-6  # Small constant; not referenced in the visible part of this file.
def split(sequence: List[T], parts: int) -> List[List[T]]:
    """Cut ``sequence`` into consecutive chunks of ``ceil(len / parts)`` items.

    Every chunk has the same length except possibly the last one, which holds
    the remainder. Because the chunk length is rounded up, fewer than
    ``parts`` chunks can be returned.

    Raises:
        AssertionError: If ``parts`` exceeds ``len(sequence)`` or the computed
            chunk length fails the partition sanity checks.
    """
    total = len(sequence)
    assert parts <= total, "more bins than probabilities"

    chunk_len = int(np.ceil(total / parts))
    # The rounded-up length must cover every item, and must be the smallest
    # length that does so.
    assert chunk_len * parts >= total, "no missing instances when partitioning"
    assert (chunk_len - 1) * parts < total, "dropping 1 does not make for missing"

    starts = range(0, total, chunk_len)
    return [sequence[start:start + chunk_len] for start in starts]
def get_equal_bins(probs: List[float], n_bins: int=10) -> Bins:
    """Derive cut points that split ``probs`` into roughly equally filled bins.

    The probabilities are sorted and chunked with ``split``. Each boundary is
    the midpoint between the last value of one chunk and the first value of
    the next chunk. Identical boundaries (which occur when many probabilities
    tie) are collapsed, so fewer than ``n_bins - 1`` values may be returned.

    Returns:
        The distinct boundaries in ascending order.
    """
    # NOTE(review): the `Bins` alias comment says 1.0 is included, but no 1.0
    # is appended here -- confirm which is intended.
    chunks = split(sorted(probs), n_bins)
    midpoints = {
        (left[-1] + right[0]) / 2.0 for left, right in zip(chunks, chunks[1:])
    }
    # De-duplicating the boundaries is the special part of this variant.
    return sorted(midpoints)
def histedges_equalN(x, nbin):
    """Return ``nbin + 1`` edges that put about the same number of points in each bin.

    The sorted values of ``x`` are treated as a function of their rank
    (0 .. len(x) - 1) and linearly interpolated at ``nbin + 1`` evenly spaced
    rank positions between 0 and ``len(x)``. The first edge is therefore the
    minimum of ``x`` and the last edge is its maximum.

    Args:
        x: One-dimensional sequence of values to bin.
        nbin: Number of bins.

    Returns:
        A NumPy array of ``nbin + 1`` non-decreasing bin edges.

    Example:
        bin_upper_edges = histedges_equalN(P, n_bins)
        n, bins, patches = plt.hist(x, histedges_equalN(x, 10))
    """
    npt = len(x)
    return np.interp(
        np.linspace(0, npt, nbin + 1),  # evenly spaced rank positions
        np.arange(npt),  # rank of each sorted sample
        np.sort(x),  # sample value at that rank
    )
def test_equalmass_binning(P, Y):
    """Compare equal-range and equal-mass binning on one set of predictions.

    Builds both kinds of bins over the confidences, digitizes the confidences
    into them, plots the equal-mass bin assignment, and prints the ECE
    computed under each scheme.

    Args:
        P: NumPy array of predictions. A 2-D array is reduced to its maximum
            over the last axis; any other array is used as is.
        Y: Reference labels, passed straight through to ``ECE()._compute``.
    """
    # probs = np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2])
    kwargs = dict(
        n_bins=10,
        # The first ECE printed below is labelled "eqm", so start with equal-mass.
        scheme="equal-mass",
        # NOTE(review): the original value is not recoverable from this copy;
        # [0, 1] assumes confidences are probabilities -- confirm.
        bin_range=[0, 1],
    )

    if P.ndim == 2:  # can assume ECE
        p_max = np.max(P, -1)  # create p̂ as top-1 softmax probability in [0, 1]
    else:
        # Previously p_max was left unbound here, which raised a NameError.
        # Assumes such a P already holds per-sample confidences -- TODO confirm.
        p_max = P

    eqr_bins = create_bins(
        n_bins=kwargs["n_bins"],
        scheme="equal-range",
        bin_range=kwargs["bin_range"],
        P=p_max,
    )
    eqm_bins = create_bins(
        n_bins=kwargs["n_bins"],
        scheme=kwargs["scheme"],
        bin_range=kwargs["bin_range"],
        P=p_max,
    )
    # alternate_eqm_bins = get_equal_bins(p_max, kwargs["n_bins"])

    # Bin index of every confidence under each scheme (right-closed intervals).
    eqr_hist = np.digitize(p_max, eqr_bins, right=True)
    eqm_hist = np.digitize(p_max, eqm_bins, right=True)

    # The next three results are not used further; the calls are kept so the
    # helpers are still exercised when this check runs.
    eqml_hist = np.digitize(p_max, eqm_bins, right=False)
    # eqm_bins = [0] + eqm_bins
    other_hist = discretize_into_bins(np.expand_dims(p_max, 0), eqm_bins)
    hist_difference = stats.power_divergence(
        eqr_hist, eqm_hist, lambda_="pearson"
    )  # chisquare

    # plt.hist(eqr_hist, color="green", label="equal-range")
    plt.hist(eqm_hist, color="blue", label="equal-mass")

    res = ECE()._compute(P, Y, **kwargs)
    print(f"eqm ECE: {res['ECE']}")

    kwargs["scheme"] = "equal-range"
    res = ECE()._compute(P, Y, **kwargs)
    print(f"eqr ECE: {res['ECE']}")

    # res = ECE()._compute(predictions, references, detail=True)
    # print(f"ECE: {res['ECE']}")
if __name__ == "__main__":
    # The optional first command-line argument overrides the default pickle.
    if len(sys.argv) > 1:
        FILE_PATH = sys.argv[1]
    else:
        FILE_PATH = "resnet110_c10_logits.p"

    # Quiet load, with softmax normalisation of the stored logits.
    val_pair, test_pair = unpickle_probs(FILE_PATH, verbose=False, normalize=True)
    p_val, y_val = val_pair
    p_test, y_test = test_pair

    # do on val
    test_equalmass_binning(p_val, y_val)