jordyvl committed
Commit
6a39113
1 Parent(s): 7d663d0

might have found a bug in binning

Files changed (2):
  1. app.py +166 -3
  2. local_app.py +43 -59
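
For context on the commit message: in the previous revision, reliability_plot called ax1.hist without an explicit bins= argument, letting matplotlib infer ten equal-width bins over the range of the data, and ax2.hist passed bins=results["y_bar"], i.e. only the upper bin edges. This commit instead builds bins_with_left_edge = np.insert(results["y_bar"], 0, 0) and passes it to both histograms. A minimal sketch of the suspected binning bug, using toy edge values rather than repo data:

import numpy as np

y_bar = np.array([0.2, 0.4, 0.6, 0.8, 1.0])  # upper bin edges, as with proxy="upper-edge"
weights = np.array([0.25, 0.35, 0.55, 0.85, 0.90])

# Old behaviour: edges are inferred from the data range [0.2, 1.0],
# so the leftmost calibration bin [0, 0.2) is silently lost.
counts_old, edges_old = np.histogram(y_bar, weights=weights)

# Fixed behaviour: prepend the missing left edge 0 so the edges cover [0, 1].
bins_with_left_edge = np.insert(y_bar, 0, 0)
counts_new, edges_new = np.histogram(y_bar, bins=bins_with_left_edge, weights=weights)

print(edges_old)  # ten equal-width bins squeezed between 0.2 and 1.0
print(edges_new)  # [0.  0.2 0.4 0.6 0.8 1. ]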
app.py CHANGED
@@ -1,8 +1,171 @@
 import evaluate
-
-from evaluate.utils import launch_gradio_widget
-
-module = evaluate.load("jordyvl/ece")
-launch_gradio_widget(module)
+import json
+import sys
+from pathlib import Path
+import gradio as gr
+
+import numpy as np
+import pandas as pd
+import ast
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+
+plt.rcParams["figure.dpi"] = 300
+plt.switch_backend(
+    "agg"
+)  # ; https://stackoverflow.com/questions/14694408/runtimeerror-main-thread-is-not-in-main-loop
+
+
+def default_plot():
+    fig = plt.figure()
+    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
+    ax2 = plt.subplot2grid((3, 1), (2, 0))
+    ranged = np.linspace(0, 1, 10)
+    ax1.plot(
+        ranged,
+        ranged,
+        color="darkgreen",
+        ls="dotted",
+        label="Perfect",
+    )
+
+    # Bin differences
+    ax1.set_ylabel("Conditional Expectation")
+    ax1.set_ylim([0, 1.05])
+    ax1.set_title("Reliability Diagram")
+    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    # Bin frequencies
+    ax2.set_xlabel("Confidence")
+    ax2.set_ylabel("Count")
+    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    return fig, ax1, ax2
+
+
+def reliability_plot(results):
+    # DEV: might still need to write tests in case of equal mass binning
+    # DEV: nicer would be to plot like a polygon
+    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+
+    def over_under_confidence(results):
+        colors = []
+        for j, bin in enumerate(results["y_bar"]):
+            perfect = results["y_bar"][j]
+            empirical = results["p_bar"][j]
+
+            bin_color = (
+                "limegreen"
+                if np.allclose(perfect, empirical)
+                else "dodgerblue"
+                if empirical < perfect
+                else "orangered"
+            )
+            colors.append(bin_color)
+        return colors
+
+    fig, ax1, ax2 = default_plot()
+
+    # Bin differences
+    bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
+    B, bins, patches = ax1.hist(
+        results["y_bar"],
+        weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0),
+        bins=bins_with_left_edge,
+    )
+    colors = over_under_confidence(results)
+    for b in range(len(B)):
+        patches[b].set_facecolor(colors[b])  # color based on over/underconfidence
+
+    ax1handles = [
+        mpatches.Patch(color="orangered", label="Overconfident"),
+        mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
+        mpatches.Patch(color="dodgerblue", label="Underconfident"),
+    ]
+
+    # Bin frequencies
+    anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
+    n_bins = len(results["y_bar"])
+    bin_freqs = np.zeros(n_bins)
+    bin_freqs[anindices] = results["bin_freq"]
+    B, newbins, patches = ax2.hist(
+        results["y_bar"], weights=bin_freqs, color="midnightblue", bins=bins_with_left_edge
+    )
+
+    acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
+    conf_plt = ax2.axvline(
+        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
+    )
+
+    ax1.legend(loc="lower right", handles=ax1handles)
+    ax2.legend(handles=[acc_plt, conf_plt])
+    ax1.set_xticks(bins_with_left_edge)
+    ax2.set_xticks(bins_with_left_edge)
+    plt.tight_layout()
+    return fig
+
+
+def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
+    # DEV: check on invalid datatypes with better warnings
+
+    if isinstance(data, pd.DataFrame):
+        data.dropna(inplace=True)
+
+    predictions = [
+        ast.literal_eval(prediction) if not isinstance(prediction, list) else prediction
+        for prediction in data["predictions"]
+    ]
+    references = [reference for reference in data["references"]]
+
+    results = metric._compute(
+        predictions,
+        references,
+        n_bins=n_bins,
+        scheme=scheme,
+        proxy=proxy,
+        p=p,
+        detail=True,
+    )
+    plot = reliability_plot(results)
+    return results["ECE"], plot
+
+
+sliders = [
+    gr.Slider(0, 100, value=10, label="n_bins"),
+    gr.Slider(
+        0, 100, value=None, label="bin_range", visible=False
+    ),  # DEV: need to have a double slider
+    gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
+    gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
+    gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
+]
+
+slider_defaults = [slider.value for slider in sliders]
+
+# example data
+component = gr.inputs.Dataframe(
+    headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
+)
+
+component.value = [
+    [[0.63, 0.2, 0.2], 0],
+    [[0.73, 0.1, 0.2], 2],
+    [[0, 0.95, 0.05], 1],
+]
+sample_data = [[component] + slider_defaults]
+
+local_path = Path(sys.path[0])
+metric = evaluate.load("jordyvl/ece")
+outputs = [gr.outputs.Textbox(label="ECE"), gr.Plot(label="Reliability diagram")]
+# outputs[1].value = default_plot().__dict__  # DEV: Does not work in gradio; needs to be JSON encoded
+
+
+iface = gr.Interface(
+    fn=compute_and_plot,
+    inputs=[component] + sliders,
+    outputs=outputs,
+    description=metric.info.description,
+    article=evaluate.utils.parse_readme(local_path / "README.md"),
+    title=f"Metric: {metric.name}",
+    # examples=sample_data; #DEV: ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
+).launch()
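
For reference, a minimal sketch of exercising the updated compute path outside Gradio, using the example rows from component.value above and the keyword arguments that compute_and_plot forwards. It goes through the public metric.compute entry point (the app calls the internal _compute directly); the assumption is that evaluate passes these keywords through to _compute, as evaluate modules normally do:

import evaluate

metric = evaluate.load("jordyvl/ece")
results = metric.compute(
    predictions=[[0.63, 0.2, 0.2], [0.73, 0.1, 0.2], [0, 0.95, 0.05]],
    references=[0, 2, 1],
    n_bins=10,
    scheme="equal-range",
    proxy="upper-edge",
    p=1,
    detail=True,  # also returns y_bar/p_bar/bin_freq, which reliability_plot consumes
)
print(results["ECE"])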
local_app.py CHANGED
@@ -7,7 +7,8 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 import ast
-#from ece import ECE # loads local instead
+
+# from ece import ECE # loads local instead
 
 
 import matplotlib.pyplot as plt
@@ -55,7 +56,7 @@ sample_data = [[component] + slider_defaults]  ##json.dumps(df)
 
 local_path = Path(sys.path[0])
 metric = evaluate.load("jordyvl/ece")
-#ECE()
+# ECE()
 # module = evaluate.load("jordyvl/ece")
 # launch_gradio_widget(module)
 
@@ -76,97 +77,80 @@ def default_plot():
         ls="dotted",
         label="Perfect",
     )
+
+    # Bin differences
     ax1.set_ylabel("Conditional Expectation")
-    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
-    ax1.legend(loc="lower right")
+    ax1.set_ylim([0, 1.05])  # respective to bin range
     ax1.set_title("Reliability Diagram")
+    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
 
     # Bin frequencies
     ax2.set_xlabel("Confidence")
     ax2.set_ylabel("Count")
     ax2.legend(loc="upper left")  # , ncol=2
-    plt.tight_layout()
-    return fig
-
-
-def over_under_confidence(results):
-    colors = []
-    for j, bin in enumerate(results["y_bar"]):
-        perfect = results["y_bar"][j]
-        empirical = results["p_bar"][j]
-        bin_color = (
-            "limegreen"
-            if perfect == empirical
-            else "dodgerblue"
-            if empirical < perfect
-            else "orangered"
-        )
-        colors.append(bin_color)
-    return colors
+    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    return fig, ax1, ax2
 
 
 def reliability_plot(results):
     # DEV: might still need to write tests in case of equal mass binning
-    fig = plt.figure()
-    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
-    ax2 = plt.subplot2grid((3, 1), (2, 0))
-
-    n_bins = len(results["y_bar"])
-    bin_range = [
-        results["y_bar"][0] - results["y_bar"][0],
-        results["y_bar"][-1],
-    ]  # np.linspace(0, 1, n_bins)
-    # if upper edge then minus binsize; same for center [but half]
-    # rwidth is dependent on the binning
+    # DEV: nicer would be to plot like a polygon
+    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+
+    def over_under_confidence(results):
+        colors = []
+        for j, bin in enumerate(results["y_bar"]):
+            perfect = results["y_bar"][j]
+            empirical = results["p_bar"][j]
+
+            bin_color = (
+                "limegreen"
+                if np.allclose(perfect, empirical)
+                else "dodgerblue"
+                if empirical < perfect
+                else "orangered"
+            )
+            colors.append(bin_color)
+        return colors
+
+    fig, ax1, ax2 = default_plot()
+
+    # Bin differences
+    bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
     B, bins, patches = ax1.hist(
-        results["y_bar"], weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0)
+        results["y_bar"],
+        weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0),
+        bins=bins_with_left_edge,
     )
     colors = over_under_confidence(results)
     for b in range(len(B)):
         patches[b].set_facecolor(colors[b])  # color based on over/underconfidence
 
-    ranged = np.linspace(bin_range[0], bin_range[1], n_bins)
-    ax1.plot(
-        ranged,
-        ranged,
-        color="limegreen",
-        ls="dotted",
-        label="Perfect",
-    )
     ax1handles = [
         mpatches.Patch(color="orangered", label="Overconfident"),
         mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
         mpatches.Patch(color="dodgerblue", label="Underconfident"),
     ]
 
+    # Bin frequencies
     anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
+    n_bins = len(results["y_bar"])
     bin_freqs = np.zeros(n_bins)
    bin_freqs[anindices] = results["bin_freq"]
-    ax2.hist(results["y_bar"], bins=results["y_bar"], weights=bin_freqs, color="midnightblue")
-
-    # DEV: nicer would be to plot like a polygon
-    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+    B, newbins, patches = ax2.hist(
+        results["y_bar"], weights=bin_freqs, color="midnightblue", bins=bins_with_left_edge
+    )
 
     acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
     conf_plt = ax2.axvline(
         x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
     )
-    ax2.legend(handles=[acc_plt, conf_plt])
 
-    # Bin differences
-    ax1.set_ylabel("Conditional Expectation")
-    ax1.set_ylim([0, 1.05])  # respective to bin range
     ax1.legend(loc="lower right", handles=ax1handles)
-    ax1.set_title("Reliability Diagram")
-    # ax1.set_xticks([0]+results["y_bar"])
-    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
-
-    # Bin frequencies
-    ax2.set_xlabel("Confidence")
-    ax2.set_ylabel("Count")
-    ax2.legend(loc="upper left")  # , ncol=2
-    # ax2.set_xticks([0, ]+results["y_bar"])
-    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+    ax2.legend(handles=[acc_plt, conf_plt])
+    ax1.set_xticks(bins_with_left_edge)
+    ax2.set_xticks(bins_with_left_edge)
     plt.tight_layout()
     return fig
 
@@ -208,4 +192,4 @@ iface = gr.Interface(
     article=evaluate.utils.parse_readme(local_path / "README.md"),
     title=f"Metric: {metric.name}",
     # examples=sample_data; # ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
-).launch()
+).launch()
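
Besides the explicit bin edges, the other behavioural change above is in over_under_confidence: the exact float comparison perfect == empirical becomes np.allclose(perfect, empirical), so a bin is flagged over- or underconfident only when the gap exceeds floating-point noise. A toy illustration with made-up values:

import numpy as np

perfect = 0.1 + 0.2  # accumulates to 0.30000000000000004 in float64
empirical = 0.3

print(perfect == empirical)             # False -> bin would be mis-coloured
print(np.allclose(perfect, empirical))  # True  -> bin counts as well calibrated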