Commit 0a06e4b by jordyvl
1 parent: 0bca13b

fix for equal mass binning

Files changed (3):
  1. ece.py +67 -23
  2. resnet110_c10_logits.p +0 -0
  3. test_resnet-cifar_logits.py +164 -0
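
In short: equal-mass binning derives bin edges from quantile groups of the sorted confidences, and duplicated confidence values can yield duplicated upper edges (degenerate, zero-width bins). The fix deduplicates the edges and caps them at 1, at the cost of possibly fewer bins than requested. A minimal sketch of that step using plain numpy (the helper name is illustrative, not part of ece.py):

import numpy as np

def equal_mass_upper_edges(P, n_bins):
    # sort confidences and split into groups of approximately equal size
    groups = np.array_split(np.sort(P), n_bins)
    # the rightmost entry per group is a bin upper edge; cap with 1 for the right edge
    edges = [g.max() for g in groups] + [1]
    # deduplicate: tied group maxima would otherwise create zero-width bins
    return np.array(sorted(set(edges)))

# three bins requested, but the tied 0.2 edges collapse into one
print(equal_mass_upper_edges(np.array([0.2, 0.2, 0.2, 0.2, 0.95, 0.95]), 3))
# -> approximately [0.2, 0.95, 1.0]
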
ece.py CHANGED
@@ -41,23 +41,13 @@ More concretely, we provide a binned empirical estimator of top-1 calibration error
 _KWARGS_DESCRIPTION = """
 Calculates how good predictions are given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each prediction
-        should be a string with tokens separated by spaces.
-    references: list of references, one for each prediction. Each
-        reference should be a string with tokens separated by spaces.
-    y_true : array-like
-        Ground truth labels.
-    p_hat : array-like
-        Array of confidence estimates.
+    predictions: 2D array of confidence estimates.
+    references: 1D array of ground-truth indices.
     n_bins : int, default=15
         Number of bins of :math:`[\\frac{1}{n_{\\text{classes}}},1]` for the confidence estimates.
-    n_classes : int, default=None
-        Number of classes. Estimated from `y` and `y_pred` if not given.
     p : int, default=1
         Power of the calibration error, :math:`1 \\leq p \\leq \\infty`.
-
-
 Returns
     Expected calibration error (ECE), float.
@@ -97,24 +87,37 @@ def create_bins(n_bins=10, scheme="equal-range", bin_range=None, P=None):
 
         # split sorted probabilities into groups of approx. equal size
         groups = np.array_split(np.sort(P), n_bins)
-        bin_upper_edges = list()
 
+        bin_upper_edges = []
         # rightmost entry per equal-size group
-        for cur_group in range(n_bins - 1):
-            bin_upper_edges += [max(groups[cur_group])]
-        bin_upper_edges += [1.01]  # [np.inf] # always +1 for right edges
+        for cur_group in range(n_bins):
+            bin_upper_edges += [max(groups[cur_group])]  # upper edges are what we compare against
+        bin_upper_edges += [1]  # always +1 for right edges
+        bin_upper_edges = sorted(set(bin_upper_edges))  # deduplicate; important for numerical conditions
+        # note: deduplication may change the number of bins
        bins = np.array(bin_upper_edges)
-        # OverflowError: cannot convert float infinity to integer
 
     return bins
 
 
 def discretize_into_bins(P, bins):
-    oneDbins = np.digitize(P, bins) - 1  # since bins contains extra rightmost & leftmost bins
+    contains_rightmost = bool(bins[-1] > 1)  # outlier bin beyond bin_range[1]
+    contains_leftmost = bool(bins[0] == 0)  # bin before bin_range[0]
+
+    oneDbins = np.digitize(
+        P, bins, right=contains_rightmost
+    )  # since bins may contain extra rightmost (& leftmost) bins
+    if contains_leftmost:
+        oneDbins -= 1
 
     # Fix to scipy.stats.binned_statistic_dd:
     # tie-breaking to the left for the rightmost bin.
     # Using `digitize`, values that fall on an edge are put in the right bin.
+
     # For the rightmost bin, we want values equal to the right
     # edge to be counted in the last bin, and not as an outlier.
@@ -130,6 +133,7 @@ def discretize_into_bins(P, bins):
         on_edge = np.where(
             (P[:, k] >= bins[-1]) & (np.around(P[:, k], decimal) == np.around(bins[-1], decimal))
         )[0]
+
         # Shift these points one bin to the left.
         oneDbins[on_edge, k] -= 1
@@ -138,16 +142,19 @@ def discretize_into_bins(P, bins):
 
 def manual_binned_statistic(P, y_correct, bins, statistic="mean"):
     bin_assignments = discretize_into_bins(np.expand_dims(P, 0), bins)[0]
+
+    # 1-indexed, as in Julia
     result = np.empty([len(bins)], float)
     result.fill(np.nan)  # cannot assume each bin will have observations
 
     flatcount = np.bincount(bin_assignments, None)
+    # cannot have a negative index
     a = flatcount.nonzero()
 
     if statistic == "mean":
         flatsum = np.bincount(bin_assignments, y_correct)
         result[a] = flatsum[a] / flatcount[a]
-    return result, bins, bin_assignments + 1  # fix for what happens in discretize_into_bins
+    return result, bins, bin_assignments + 1  # upper right edge as proxy
 
 
 def bin_calibrated_accuracy(bins, proxy="upper-edge"):
@@ -168,16 +175,19 @@ def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge", detail=False):
     Summary: weighted average over the accuracy/confidence difference of discrete bins of prediction probability
     """
 
-    n_bins = len(bins) - 1
+    n_bins = len(bins) - 1  # true number of bins
     bin_range = [min(bins), max(bins)]
 
     # average bin probability (e.g., 0.55 for the bin [0.50, 0.60]), mean per bin; or right/upper bin edges
-    calibrated_acc = bin_calibrated_accuracy(bins, proxy="upper-edge")
+    calibrated_acc = bin_calibrated_accuracy(bins, proxy=proxy)
 
     empirical_acc, bin_edges, bin_assignment = manual_binned_statistic(P, y_correct, bins)
     bin_numbers, weights_ece = np.unique(bin_assignment, return_counts=True)
     anindices = bin_numbers - 1  # reduce bin counts; left edge; indexes right by default
 
     # Expected calibration error
     if p < np.inf:  # L^p-CE
         CE = np.average(
@@ -292,7 +302,7 @@ class ECE(evaluate.EvaluationModule):
         }
 
 
-def test_ECE():
+def test_ECE(**kwargs):
     N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
     K = 5  # K-class problem
@@ -308,7 +318,7 @@ def test_ECE():
     references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
     references = np.array(references, dtype=np.int64)
     predictions = np.array(predictions, dtype=np.float32)
-    res = ECE()._compute(predictions, references)
+    res = ECE()._compute(predictions, references, **kwargs)
     print(f"ECE: {res['ECE']}")
 
     res = ECE()._compute(predictions, references, detail=True)
@@ -324,6 +334,40 @@ def test_deterministic():
     print(f"ECE: {res['ECE']}\n {res}")
 
 
+def test_equalmass_binning():
+    probs = np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2])
+
+    kwargs = dict(
+        n_bins=5,
+        scheme="equal-mass",
+        bin_range=None,
+        proxy="upper-edge",
+        # proxy="center",
+        p=1,
+        detail=True,
+    )
+    bins = create_bins(
+        n_bins=kwargs["n_bins"], scheme=kwargs["scheme"], bin_range=kwargs["bin_range"], P=probs
+    )
+
+    test_ECE(**kwargs)
+
+    """
+    res = ECE()._compute(
+        references=[0, 1, 2],
+        predictions=[[0.63, 0.2, 0.2], [0, 0.95, 0.05], [0.72, 0.1, 0.2]],
+        detail=True,
+    )
+    print(f"ECE: {res['ECE']}\n {res}")
+    """
+    # need to provide lens
+
+
 if __name__ == "__main__":
+    test_equalmass_binning()
     test_deterministic()
     test_ECE()
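
Context for the tie-breaking comments in discretize_into_bins: np.digitize puts values that fall exactly on an edge into the bin to their right by default, which turns a confidence of exactly 1.0 into an outlier unless handled. A small self-contained demonstration of that numpy behavior (not code from this commit):

import numpy as np

edges = np.array([0.2, 0.5, 1.0])
x = np.array([0.1, 0.2, 0.5, 1.0])

# right=False (default): bins satisfy edges[i-1] <= x < edges[i],
# so a value equal to an edge lands in the bin to its right;
# x == 1.0 falls past the last edge and indexes as an outlier.
print(np.digitize(x, edges, right=False))  # [0 1 2 3]

# right=True: edges[i-1] < x <= edges[i], so edge values land to the left
# and x == 1.0 stays inside the last bin.
print(np.digitize(x, edges, right=True))   # [0 0 1 2]
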
resnet110_c10_logits.p ADDED
Binary file (685 kB)
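
For orientation, this pickle is consumed by unpickle_probs in the test script below; a minimal peek at its layout, assuming the (logits, labels) format of the NN_calibration dumps:

import pickle

with open("resnet110_c10_logits.p", "rb") as f:
    y1, y2 = pickle.load(f)

# pretrained-model dumps store raw logits and labels, split into val/test downstream
print(getattr(y1, "shape", type(y1)), getattr(y2, "shape", type(y2)))
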
 
test_resnet-cifar_logits.py ADDED
@@ -0,0 +1,164 @@
+"""
+This testing script loads actual probabilistic predictions from a ResNet finetuned on CIFAR.
+
+There are a number of logits-groundtruth pickles available @ https://github.com/markus93/NN_calibration/tree/master/logits
+[Seems to have moved from Git-LFS to SharePoint]
+https://tartuulikool-my.sharepoint.com/:f:/g/personal/markus93_ut_ee/EmW0xbhcic5Ou0lRbTrySOUBF2ccSsN7lo6lvSfuG1djew?e=l0TErb
+
+See https://github.com/markus93/NN_calibration/blob/master/logits/Readme.txt to decode the [model_dataset] filenames
+
+As a bonus, one could consider temperature scaling and measuring after calibration.
+"""
+import sys
+import pickle
+
+import numpy as np
+import scipy.stats as stats
+from scipy.special import softmax
+from sklearn.model_selection import train_test_split
+from matplotlib import pyplot as plt
+
+from ece import create_bins, discretize_into_bins, ECE
+
+
+def unpickle_probs(file, verbose=0, normalize=True):
+    """Open file with pickled (logits, labels) variables."""
+    with open(file, "rb") as f:
+        y1, y2 = pickle.load(f)  # unpickle the content
+
+    if isinstance(y1, tuple):
+        y_probs_val, y_val = y1
+        y_probs_test, y_test = y2
+    else:
+        y_probs_val, y_probs_test, y_val, y_test = train_test_split(
+            y1, y2.reshape(-1, 1), test_size=len(y2) - 5000, random_state=15
+        )  # splits the data in the case of pretrained models
+
+    if normalize:
+        y_probs_val = softmax(y_probs_val, -1)
+        y_probs_test = softmax(y_probs_test, -1)
+
+    if verbose:
+        print("y_probs_val:", y_probs_val.shape)  # (5000, 10); validation set probabilities
+        print("y_true_val:", y_val.shape)  # (5000, 1); validation set true labels
+        print("y_probs_test:", y_probs_test.shape)  # (10000, 10); test set probabilities
+        print("y_true_test:", y_test.shape)  # (10000, 1); test set true labels
+
+    return (y_probs_val, y_val.ravel()), (y_probs_test, y_test.ravel())
+
+
+def unpickle_structured_probs(valpath=None, testpath=None):
+    valpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_validation_UTY.pickle"
+    testpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_test_UTY.pickle"
+
+    with open(valpath, "rb") as f:
+        X_val, _, y_val, _ = pickle.load(f)
+
+    with open(testpath, "rb") as f:
+        X_test, _, y_test, _ = pickle.load(f)
+
+    # structured logits: originally exponentiated [different purposes]
+    X_val = np.log(X_val)
+    X_test = np.log(X_test)
+    return (X_val, y_val), (X_test, y_test)  # mirror unpickle_probs' return structure
+
+
+"""
+ALTERNATE equal-mass binning
+"""
+# Define data types.
+from typing import List, Tuple, TypeVar
+
+Data = List[Tuple[float, float]]  # list of (predicted_probability, true_label)
+Bins = List[float]  # list of bin boundaries, excluding 0.0, but including 1.0
+BinnedData = List[Data]  # binned_data[i] contains the data in bin i
+T = TypeVar("T")
+
+eps = 1e-6
+
+
+def split(sequence: List[T], parts: int) -> List[List[T]]:
+    assert parts <= len(sequence), "more bins than probabilities"
+    part_size = int(np.ceil(len(sequence) * 1.0 / parts))
+    assert part_size * parts >= len(sequence), "no missing instances when partitioning"
+    assert (part_size - 1) * parts < len(sequence), "dropping 1 does not make for missing"
+    return [sequence[i : i + part_size] for i in range(0, len(sequence), part_size)]
+
+
+def get_equal_bins(probs: List[float], n_bins: int = 10) -> Bins:
+    """Get bins that contain approximately an equal number of data points."""
+    sorted_probs = sorted(probs)
+    binned_data = split(sorted_probs, n_bins)
+    bins: Bins = []
+    for i in range(len(binned_data) - 1):
+        last_prob = binned_data[i][-1]
+        next_first_prob = binned_data[i + 1][0]
+        bins.append((last_prob + next_first_prob) / 2.0)
+    bins.append(1.0)
+    bins = sorted(set(bins))  # this deduplication is the special thing!
+    return bins
+
+
+def histedges_equalN(x, nbin):
+    npt = len(x)
+    return np.interp(np.linspace(0, npt, nbin + 1), np.arange(npt), np.sort(x))
+
+
+'''
+bin_upper_edges = histedges_equalN(P, n_bins)
+# n, bins, patches = plt.hist(x, histedges_equalN(x, 10))
+'''
+
+
+def test_equalmass_binning(P, Y):
+    # probs = np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2])
+
+    kwargs = dict(
+        n_bins=10,
+        scheme="equal-mass",
+        bin_range=None,
+        proxy="upper-edge",
+        # proxy="center",
+        p=1,
+        detail=True,
+    )
+
+    if P.ndim == 2:  # can assume ECE
+        p_max = np.max(P, -1)  # create p̂ as top-1 softmax probability ∈ [0,1]
+
+        eqr_bins = create_bins(n_bins=kwargs["n_bins"], scheme="equal-range", bin_range=kwargs["bin_range"], P=p_max)
+        eqm_bins = create_bins(n_bins=kwargs["n_bins"], scheme=kwargs["scheme"], bin_range=kwargs["bin_range"], P=p_max)
+        # alternate_eqm_bins = get_equal_bins(p_max, kwargs["n_bins"])
+
+        eqr_hist = np.digitize(p_max, eqr_bins, right=True)
+        eqm_hist = np.digitize(p_max, eqm_bins, right=True)
+        eqml_hist = np.digitize(p_max, eqm_bins, right=False)
+
+        # eqm_bins = [0] + eqm_bins
+
+        other_hist = discretize_into_bins(np.expand_dims(p_max, 0), eqm_bins)
+        hist_difference = stats.power_divergence(eqr_hist, eqm_hist, lambda_="pearson")  # chi-square
+
+        # plt.hist(eqr_hist, color="green", label="equal-range")
+        plt.hist(eqm_hist, color="blue", label="equal-mass")
+        plt.legend()
+        # plt.show()
+
+    res = ECE()._compute(P, Y, **kwargs)
+    print(f"eqm ECE: {res['ECE']}")
+
+    kwargs["scheme"] = "equal-range"
+    res = ECE()._compute(P, Y, **kwargs)
+    print(f"eqr ECE: {res['ECE']}")
+
+    # res = ECE()._compute(predictions, references, detail=True)
+    # print(f"ECE: {res['ECE']}")
+
+
+if __name__ == "__main__":
+    FILE_PATH = sys.argv[1] if len(sys.argv) > 1 else "resnet110_c10_logits.p"
+    (p_val, y_val), (p_test, y_test) = unpickle_probs(FILE_PATH, False, True)
+    test_equalmass_binning(p_val, y_val)
+    # run on the validation split
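
The two equal-mass constructions above differ only in where they place the edges: create_bins uses each quantile group's maximum as an upper edge, while get_equal_bins uses the midpoint between adjacent groups. A quick side-by-side on the toy probabilities from test_equalmass_binning (printed values approximate):

import numpy as np

probs = np.sort(np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2]))
groups = np.array_split(probs, 3)  # three equal-mass groups of three values

# create_bins-style: group maxima as upper edges, capped at 1, deduplicated
print(np.array(sorted(set(g.max() for g in groups) | {1})))  # -> [0.1, 0.2, 0.95, 1.0]

# get_equal_bins-style: midpoints between neighbouring groups, then 1.0
mids = [(groups[i][-1] + groups[i + 1][0]) / 2 for i in range(len(groups) - 1)]
print(np.array(mids + [1.0]))  # -> [0.15, 0.415, 1.0]
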