jordyvl committed
Commit
6a39113
1 Parent(s): 7d663d0

might have found a bug in binning

Files changed (2):
  1. app.py +166 -3
  2. local_app.py +43 -59
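
For context on the commit message: in the previous revision, reliability_plot called ax1.hist without an explicit bins= argument, letting matplotlib infer ten equal-width bins over the range of the data, and ax2.hist passed bins=results["y_bar"], i.e. only the upper bin edges. This commit instead builds bins_with_left_edge = np.insert(results["y_bar"], 0, 0) and passes it to both histograms. A minimal sketch of the suspected binning bug, using toy edge values rather than repo data:

import numpy as np

y_bar = np.array([0.2, 0.4, 0.6, 0.8, 1.0])  # upper bin edges, as with proxy="upper-edge"
weights = np.array([0.25, 0.35, 0.55, 0.85, 0.90])

# Old behaviour: edges are inferred from the data range [0.2, 1.0],
# so the leftmost calibration bin [0, 0.2) is silently lost.
counts_old, edges_old = np.histogram(y_bar, weights=weights)

# Fixed behaviour: prepend the missing left edge 0 so the edges cover [0, 1].
bins_with_left_edge = np.insert(y_bar, 0, 0)
counts_new, edges_new = np.histogram(y_bar, bins=bins_with_left_edge, weights=weights)

print(edges_old)  # ten equal-width bins squeezed between 0.2 and 1.0
print(edges_new)  # [0.  0.2 0.4 0.6 0.8 1. ]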
app.py CHANGED
@@ -1,8 +1,171 @@
 import evaluate
-
-from evaluate.utils import launch_gradio_widget
-
-module = evaluate.load("jordyvl/ece")
-launch_gradio_widget(module)
+import json
+import sys
+from pathlib import Path
+import gradio as gr
+
+import numpy as np
+import pandas as pd
+import ast
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+
+plt.rcParams["figure.dpi"] = 300
+plt.switch_backend(
+    "agg"
+)  # ; https://stackoverflow.com/questions/14694408/runtimeerror-main-thread-is-not-in-main-loop
+
+
+def default_plot():
+    fig = plt.figure()
+    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
+    ax2 = plt.subplot2grid((3, 1), (2, 0))
+    ranged = np.linspace(0, 1, 10)
+    ax1.plot(
+        ranged,
+        ranged,
+        color="darkgreen",
+        ls="dotted",
+        label="Perfect",
+    )
+
+    # Bin differences
+    ax1.set_ylabel("Conditional Expectation")
+    ax1.set_ylim([0, 1.05])
+    ax1.set_title("Reliability Diagram")
+    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    # Bin frequencies
+    ax2.set_xlabel("Confidence")
+    ax2.set_ylabel("Count")
+    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    return fig, ax1, ax2
+
+
+def reliability_plot(results):
+    # DEV: might still need to write tests in case of equal mass binning
+    # DEV: nicer would be to plot like a polygon
+    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+
+    def over_under_confidence(results):
+        colors = []
+        for j, bin in enumerate(results["y_bar"]):
+            perfect = results["y_bar"][j]
+            empirical = results["p_bar"][j]
+
+            bin_color = (
+                "limegreen"
+                if np.allclose(perfect, empirical)
+                else "dodgerblue"
+                if empirical < perfect
+                else "orangered"
+            )
+            colors.append(bin_color)
+        return colors
+
+    fig, ax1, ax2 = default_plot()
+
+    # Bin differences
+    bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
+    B, bins, patches = ax1.hist(
+        results["y_bar"],
+        weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0),
+        bins=bins_with_left_edge,
+    )
+    colors = over_under_confidence(results)
+    for b in range(len(B)):
+        patches[b].set_facecolor(colors[b])  # color based on over/underconfidence
+
+    ax1handles = [
+        mpatches.Patch(color="orangered", label="Overconfident"),
+        mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
+        mpatches.Patch(color="dodgerblue", label="Underconfident"),
+    ]
+
+    # Bin frequencies
+    anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
+    n_bins = len(results["y_bar"])
+    bin_freqs = np.zeros(n_bins)
+    bin_freqs[anindices] = results["bin_freq"]
+    B, newbins, patches = ax2.hist(
+        results["y_bar"], weights=bin_freqs, color="midnightblue", bins=bins_with_left_edge
+    )
+
+    acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
+    conf_plt = ax2.axvline(
+        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
+    )
+
+    ax1.legend(loc="lower right", handles=ax1handles)
+    ax2.legend(handles=[acc_plt, conf_plt])
+    ax1.set_xticks(bins_with_left_edge)
+    ax2.set_xticks(bins_with_left_edge)
+    plt.tight_layout()
+    return fig
+
+
+def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
+    # DEV: check on invalid datatypes with better warnings
+
+    if isinstance(data, pd.DataFrame):
+        data.dropna(inplace=True)
+
+    predictions = [
+        ast.literal_eval(prediction) if not isinstance(prediction, list) else prediction
+        for prediction in data["predictions"]
+    ]
+    references = [reference for reference in data["references"]]
+
+    results = metric._compute(
+        predictions,
+        references,
+        n_bins=n_bins,
+        scheme=scheme,
+        proxy=proxy,
+        p=p,
+        detail=True,
+    )
+    plot = reliability_plot(results)
+    return results["ECE"], plot
+
+
+sliders = [
+    gr.Slider(0, 100, value=10, label="n_bins"),
+    gr.Slider(
+        0, 100, value=None, label="bin_range", visible=False
+    ),  # DEV: need to have a double slider
+    gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
+    gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
+    gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
+]
+
+slider_defaults = [slider.value for slider in sliders]
+
+# example data
+component = gr.inputs.Dataframe(
+    headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
+)
+
+component.value = [
+    [[0.63, 0.2, 0.2], 0],
+    [[0.73, 0.1, 0.2], 2],
+    [[0, 0.95, 0.05], 1],
+]
+sample_data = [[component] + slider_defaults]
+
+local_path = Path(sys.path[0])
+metric = evaluate.load("jordyvl/ece")
+outputs = [gr.outputs.Textbox(label="ECE"), gr.Plot(label="Reliability diagram")]
+# outputs[1].value = default_plot().__dict__  # DEV: Does not work in gradio; needs to be JSON encoded
+
+
+iface = gr.Interface(
+    fn=compute_and_plot,
+    inputs=[component] + sliders,
+    outputs=outputs,
+    description=metric.info.description,
+    article=evaluate.utils.parse_readme(local_path / "README.md"),
+    title=f"Metric: {metric.name}",
+    # examples=sample_data; #DEV: ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
+).launch()
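
For reference, a minimal sketch of exercising the updated compute path outside Gradio, using the example rows from component.value above and the keyword arguments that compute_and_plot forwards. It goes through the public metric.compute entry point (the app calls the internal _compute directly); the assumption is that evaluate passes these keywords through to _compute, as evaluate modules normally do:

import evaluate

metric = evaluate.load("jordyvl/ece")
results = metric.compute(
    predictions=[[0.63, 0.2, 0.2], [0.73, 0.1, 0.2], [0, 0.95, 0.05]],
    references=[0, 2, 1],
    n_bins=10,
    scheme="equal-range",
    proxy="upper-edge",
    p=1,
    detail=True,  # also returns y_bar/p_bar/bin_freq, which reliability_plot consumes
)
print(results["ECE"])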
local_app.py CHANGED
@@ -7,7 +7,8 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 import ast
-#from ece import ECE # loads local instead
+
+# from ece import ECE # loads local instead
 
 
 import matplotlib.pyplot as plt
@@ -55,7 +56,7 @@ sample_data = [[component] + slider_defaults]  ##json.dumps(df)
 
 local_path = Path(sys.path[0])
 metric = evaluate.load("jordyvl/ece")
-#ECE()
+# ECE()
 # module = evaluate.load("jordyvl/ece")
 # launch_gradio_widget(module)
 
@@ -76,97 +77,80 @@ def default_plot():
         ls="dotted",
         label="Perfect",
     )
+
+    # Bin differences
     ax1.set_ylabel("Conditional Expectation")
-    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
-    ax1.legend(loc="lower right")
+    ax1.set_ylim([0, 1.05])  # respective to bin range
     ax1.set_title("Reliability Diagram")
+    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
 
     # Bin frequencies
     ax2.set_xlabel("Confidence")
     ax2.set_ylabel("Count")
     ax2.legend(loc="upper left")  # , ncol=2
-    plt.tight_layout()
-    return fig
-
-
-def over_under_confidence(results):
-    colors = []
-    for j, bin in enumerate(results["y_bar"]):
-        perfect = results["y_bar"][j]
-        empirical = results["p_bar"][j]
-        bin_color = (
-            "limegreen"
-            if perfect == empirical
-            else "dodgerblue"
-            if empirical < perfect
-            else "orangered"
-        )
-        colors.append(bin_color)
-    return colors
+    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+
+    return fig, ax1, ax2
 
 
 def reliability_plot(results):
     # DEV: might still need to write tests in case of equal mass binning
-    fig = plt.figure()
-    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
-    ax2 = plt.subplot2grid((3, 1), (2, 0))
-
-    n_bins = len(results["y_bar"])
-    bin_range = [
-        results["y_bar"][0] - results["y_bar"][0],
-        results["y_bar"][-1],
-    ]  # np.linspace(0, 1, n_bins)
-    # if upper edge then minus binsize; same for center [but half]
-    # rwidth is dependent on the binning
+    # DEV: nicer would be to plot like a polygon
+    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+
+    def over_under_confidence(results):
+        colors = []
+        for j, bin in enumerate(results["y_bar"]):
+            perfect = results["y_bar"][j]
+            empirical = results["p_bar"][j]
+
+            bin_color = (
+                "limegreen"
+                if np.allclose(perfect, empirical)
+                else "dodgerblue"
+                if empirical < perfect
+                else "orangered"
+            )
+            colors.append(bin_color)
+        return colors
+
+    fig, ax1, ax2 = default_plot()
+
+    # Bin differences
+    bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
     B, bins, patches = ax1.hist(
-        results["y_bar"], weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0)
+        results["y_bar"],
+        weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0),
+        bins=bins_with_left_edge,
     )
     colors = over_under_confidence(results)
     for b in range(len(B)):
         patches[b].set_facecolor(colors[b])  # color based on over/underconfidence
 
-    ranged = np.linspace(bin_range[0], bin_range[1], n_bins)
-    ax1.plot(
-        ranged,
-        ranged,
-        color="limegreen",
-        ls="dotted",
-        label="Perfect",
-    )
     ax1handles = [
         mpatches.Patch(color="orangered", label="Overconfident"),
         mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
         mpatches.Patch(color="dodgerblue", label="Underconfident"),
     ]
 
+    # Bin frequencies
     anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
+    n_bins = len(results["y_bar"])
     bin_freqs = np.zeros(n_bins)
    bin_freqs[anindices] = results["bin_freq"]
-    ax2.hist(results["y_bar"], bins=results["y_bar"], weights=bin_freqs, color="midnightblue")
-
-    # DEV: nicer would be to plot like a polygon
-    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
+    B, newbins, patches = ax2.hist(
+        results["y_bar"], weights=bin_freqs, color="midnightblue", bins=bins_with_left_edge
+    )
 
     acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
     conf_plt = ax2.axvline(
         x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
     )
-    ax2.legend(handles=[acc_plt, conf_plt])
 
-    # Bin differences
-    ax1.set_ylabel("Conditional Expectation")
-    ax1.set_ylim([0, 1.05])  # respective to bin range
     ax1.legend(loc="lower right", handles=ax1handles)
-    ax1.set_title("Reliability Diagram")
-    # ax1.set_xticks([0]+results["y_bar"])
-    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
-
-    # Bin frequencies
-    ax2.set_xlabel("Confidence")
-    ax2.set_ylabel("Count")
-    ax2.legend(loc="upper left")  # , ncol=2
-    # ax2.set_xticks([0, ]+results["y_bar"])
-    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
+    ax2.legend(handles=[acc_plt, conf_plt])
+    ax1.set_xticks(bins_with_left_edge)
+    ax2.set_xticks(bins_with_left_edge)
     plt.tight_layout()
     return fig
 
@@ -208,4 +192,4 @@ iface = gr.Interface(
     article=evaluate.utils.parse_readme(local_path / "README.md"),
     title=f"Metric: {metric.name}",
     # examples=sample_data; # ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
-).launch()
+).launch()
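
Besides the explicit bin edges, the other behavioural change above is in over_under_confidence: the exact float comparison perfect == empirical becomes np.allclose(perfect, empirical), so a bin is flagged over- or underconfident only when the gap exceeds floating-point noise. A toy illustration with made-up values:

import numpy as np

perfect = 0.1 + 0.2  # accumulates to 0.30000000000000004 in float64
empirical = 0.3

print(perfect == empirical)             # False -> bin would be mis-coloured
print(np.allclose(perfect, empirical))  # True  -> bin counts as well calibrated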