Spaces:

per
/

benchbench

Running

App Files Files Community

Yotam-Perlitz committed on Sep 11, 2024

Commit

a3b611d

1 Parent(s): 9e72aa4

improve logic

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1) hide show

app.py +169 -67

app.py CHANGED Viewed

@@ -8,6 +8,26 @@ from bat import Benchmark, Config, Reporter, Tester
 from datetime import datetime
 holistic_scenarios = [
     "Helm Lite",
     "HF OpenLLM v2",
@@ -21,14 +41,38 @@ holistic_scenarios = [
 st.markdown(
-    """<h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>""",
     unsafe_allow_html=True,
 )
 st.markdown(
     """
-    This leaderboard, featured in our work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696),
-    serves as a meta-benchmark. It ranks individual benchmarks based on their agreement with an aggregated reference benchmark, which harnesses insights from numerous diverse benchmarks.
     """
 )
@@ -38,26 +82,19 @@ all_scenarios_for_aggragate = (
     all_scenarios_for_aggragate.df["scenario"].unique().tolist()
 )
-st.subheader("The Leaderboard", divider=True)
-# st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
-with st.form("my_form_0"):
-    # leftcol, rightcol = st.columns([5, 1])
-    # with leftcol:
-    aggragate_scenarios = st.multiselect(
-        "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
-        all_scenarios_for_aggragate,
-        holistic_scenarios,
-    )
-    # with rightcol:
-    # st.markdown("###")
-    submitted = st.form_submit_button(label="\n\nRun BAT\n\n")
-with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
     with st.form("my_form_1"):
         corr_type = st.selectbox(
-            label="Select Correlation type", options=["kendall", "pearson"], index=0
         )
         aggregate_scenario_whitelist = aggragate_scenarios
@@ -68,13 +105,13 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
         # ]
         model_select_strategy = st.selectbox(
-            label="Select strategy",
             options=["random", "top_aggregate", "somewhere_aggregate"],
             index=0,
         )
         n_models_taken_list = st.slider(
-            label="Select number of models to use",
             min_value=3,
             max_value=15,
             value=8,
@@ -82,46 +119,67 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
         n_models_taken_list = [n_models_taken_list]
-        n_exps = 10
         submitted = st.form_submit_button(label="Run BAT")
 with st.expander("Add your benchmarks here!", icon="🔥"):
-    uploaded_file = st.file_uploader("Add your benchmark as a CSV")
-    st.download_button(
-        label="Download example CSV",
-        data=pd.read_csv("assets/mybench_240901.csv")
-        .to_csv(index=False)
-        .encode("utf-8"),
-        file_name="mybench_240901.csv",
-        mime="text/csv",
     )
     my_benchmark = Benchmark()
     if uploaded_file is not None:
         df = pd.read_csv(uploaded_file)
         my_benchmark.assign_df(
             df,
             data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
-        )
-        allbench = Benchmark()
-        allbench.load_local_catalog()
-        allbench.add_aggregate(
-            new_col_name="aggregate",
-            agg_source_name="aggregate",
-            scenario_whitelist=aggregate_scenario_whitelist,
-            min_scenario_for_models_to_appear_in_agg=1
-            if len(aggregate_scenario_whitelist) == 1
-            else 3,
         )
         uploaded_models = my_benchmark.df[
             my_benchmark.df["source"].str.contains("uploaded")
         ]["model"].unique()
-        aggregate_models = allbench.df[allbench.df["source"].str.contains("aggregate")][
             "model"
         ].unique()
@@ -180,8 +238,12 @@ def run_load(
         aggregate_scores = pd.read_csv(
             cache_path.replace("agreement", "aggregate_scores")
         )
-        return agreements, aggregate_scores
     else:
         print("Cached results not found, calculating")
@@ -245,11 +307,12 @@ def run_load(
         aggragate_scores.to_csv(
             cache_path.replace("agreement", "aggregate_scores"), index=False
         )
-    return agreements, aggragate_scores
-agreements, aggragare_score_df = run_load(
     aggregate_scenario_whitelist=aggregate_scenario_whitelist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
@@ -275,17 +338,15 @@ z_scores["date"] = z_scores["source"].apply(
     else x.split(".csv")[0].split("_")[-2]
 )
-# print(z_scores["scenario"].unique().tolist())
-# z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
-z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
-# , format="%y%m%d"
 data = (
     z_scores.rename(
         columns={
             "scenario": "Benchmark",
-            "z_score": "Z Score",
             "corr_with_agg": corr_name,
             "p_value_of_corr_with_agg": "p-value of Corr.",
             # "n_models_of_corr_with_agg": "# Models Used",
@@ -293,7 +354,7 @@ data = (
             "date": "Snapshot Date",
         }
     )
-    .sort_values("Z Score", ascending=False)
     .reset_index(drop=True)
 )
@@ -308,10 +369,10 @@ def highlight_uploaded_benchmark(row):
 styled_data = (
     data.style.background_gradient(
-        subset=["Z Score"],
         cmap="RdYlGn",
-        vmin=-data["Z Score"].abs().max(),
-        vmax=data["Z Score"].abs().max(),
     )
     .apply(highlight_uploaded_benchmark, axis=1)
     .background_gradient(
@@ -320,17 +381,19 @@ styled_data = (
         vmin=0.1,
         vmax=1,
     )
-    .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
     .set_properties(**{"text-align": "center"})
 )
 cols_used = [
     "Benchmark",
-    "Z Score",
     corr_name,
     "p-value of Corr.",
     "Snapshot Date",
 ]
 st.dataframe(
     data=styled_data,
     column_order=cols_used,
@@ -348,7 +411,8 @@ aggragare_score_df.rename(
     },
     inplace=True,
 )
-with st.expander(label="Model scored by the aggragate"):
     st.dataframe(
         data=aggragare_score_df,
         hide_index=True,
@@ -632,6 +696,52 @@ with st.expander(label="Citations"):
     """
     )
 st.markdown(
     "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
     "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
@@ -648,14 +758,6 @@ st.write(r"""
             """)
-benchmarks = data["Benchmark"].unique().tolist()
-plotted_scenario = st.selectbox(
-    "Choose Benchmark to plot",
-    benchmarks,
-    index=benchmarks.index("LMSys Arena"),
-)
 fig = px.histogram(
     data.query("Benchmark!=@plotted_scenario"), x=corr_name, nbins=len(data) - 1
 )

 from datetime import datetime
+st.set_page_config(
+    page_title="BenchBench",
+    page_icon="🏋️‍♂️",
+    layout="wide",
+    initial_sidebar_state="auto",
+    menu_items=None,
+)
+# # Inject custom CSS to set the width of the sidebar
+# st.markdown(
+#     """
+#     <style>
+#         section[data-testid="stSidebar"] {
+#             width: 200px !important; # Set the width to your desired value
+#         }
+#     </style>
+#     """,
+#     unsafe_allow_html=True,
+# )
 holistic_scenarios = [
     "Helm Lite",
     "HF OpenLLM v2",
 st.markdown(
+    """
+    <h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>
+    """,
     unsafe_allow_html=True,
 )
+st.divider()
+st.markdown(
+    """
+    The BenchBench leaderboard ranks benchmarks based on their agreement with the *Aggregate Benchmark* – a comprehensive, combined measure of existing benchmark results.
+    \n
+    To achive it, we scraped results from multiple benchmarks (citations below) to allow for obtaining benchmark agreement results with a wide range of benchmark using a large set of models.
+    \n
+    BenchBench is for you if:
+    """
+)
 st.markdown(
     """
+    - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
+    - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferble alternative.
+    """
+)
+st.markdown(
+    """
+    In our work -- [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696),
+    we standardize BAT and show the importance of its configurations, notably,
+    the benchmarks we compare to, and the models we use to compare with, check it out int he sidebar.
+    \n
+    We show that agreements are best reporesented with the Z Score, the relative agreement of each benchmark to the Aggragate benchmark, as presented below.
     """
 )
     all_scenarios_for_aggragate.df["scenario"].unique().tolist()
 )
+with st.sidebar:
+    st.markdown("""# Configurations""")
+    # with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
     with st.form("my_form_1"):
+        aggragate_scenarios = st.multiselect(
+            "Aggregate Benchmark",
+            all_scenarios_for_aggragate,
+            holistic_scenarios,
+        )
         corr_type = st.selectbox(
+            label="Correlation type", options=["kendall", "pearson"], index=0
         )
         aggregate_scenario_whitelist = aggragate_scenarios
         # ]
         model_select_strategy = st.selectbox(
+            label="Model Select strategy",
             options=["random", "top_aggregate", "somewhere_aggregate"],
             index=0,
         )
         n_models_taken_list = st.slider(
+            label="Minimal number of models to use",
             min_value=3,
             max_value=15,
             value=8,
         n_models_taken_list = [n_models_taken_list]
+        n_exps = 5
         submitted = st.form_submit_button(label="Run BAT")
 with st.expander("Add your benchmarks here!", icon="🔥"):
+    aggbench = Benchmark()
+    aggbench.load_local_catalog()
+    aggbench.add_aggregate(
+        new_col_name="aggregate",
+        agg_source_name="aggregate",
+        scenario_whitelist=aggregate_scenario_whitelist,
+        min_scenario_for_models_to_appear_in_agg=1
+        if len(aggregate_scenario_whitelist) == 1
+        else 3,
+    )
+    agg_models = (
+        aggbench.df.query('scenario=="aggregate"').sample(n=10)["model"].tolist()
+    )
+    st.markdown(
+        "Adding your benchmark is as simple as uploading a csv with the following format, one column indicates the model and the other the benchmark scores."
+    )
+    st.dataframe(
+        pd.read_csv("assets/mybench_240901.csv"),
+        use_container_width=True,
+        hide_index=True,
+        height=200,
+    )
+    st.markdown(
+        "Not sure, what models you should run your benchmark on?" "\ntry these:"
     )
+    st.code(agg_models)
+    st.markdown("Got the data? Upload it here 👇:")
+    uploaded_file = st.file_uploader("Add your benchmark as a CSV")
     my_benchmark = Benchmark()
     if uploaded_file is not None:
+        st.markdown(
+            "Your benchmark has been uploaded, BAT results will soon be caluclated... check out its results here: [Benchmark BAT Report Card](#benchmark-report-card)"
+        )
         df = pd.read_csv(uploaded_file)
         my_benchmark.assign_df(
             df,
             data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
+            normalized_names=False,
         )
         uploaded_models = my_benchmark.df[
             my_benchmark.df["source"].str.contains("uploaded")
         ]["model"].unique()
+        aggregate_models = aggbench.df[aggbench.df["source"].str.contains("aggregate")][
             "model"
         ].unique()
         aggregate_scores = pd.read_csv(
             cache_path.replace("agreement", "aggregate_scores")
         )
+        allbench = Benchmark(
+            pd.read_csv(cache_path.replace("agreement", "allbench")),
+            normalized_names=True,
+        )
+        return agreements, aggregate_scores, allbench
     else:
         print("Cached results not found, calculating")
         aggragate_scores.to_csv(
             cache_path.replace("agreement", "aggregate_scores"), index=False
         )
+        allbench.df.to_csv(cache_path.replace("agreement", "allbench"), index=False)
+    return agreements, aggragate_scores, allbench
+agreements, aggragare_score_df, allbench = run_load(
     aggregate_scenario_whitelist=aggregate_scenario_whitelist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
     else x.split(".csv")[0].split("_")[-2]
 )
+z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
+z_score_name = "Relative agreement (Z Score)"
 data = (
     z_scores.rename(
         columns={
             "scenario": "Benchmark",
+            "z_score": z_score_name,
             "corr_with_agg": corr_name,
             "p_value_of_corr_with_agg": "p-value of Corr.",
             # "n_models_of_corr_with_agg": "# Models Used",
             "date": "Snapshot Date",
         }
     )
+    .sort_values(z_score_name, ascending=False)
     .reset_index(drop=True)
 )
 styled_data = (
     data.style.background_gradient(
+        subset=[z_score_name],
         cmap="RdYlGn",
+        vmin=-data[z_score_name].abs().max(),
+        vmax=data[z_score_name].abs().max(),
     )
     .apply(highlight_uploaded_benchmark, axis=1)
     .background_gradient(
         vmin=0.1,
         vmax=1,
     )
+    .format(subset=[z_score_name, corr_name, "p-value of Corr."], formatter="{:.2}")
     .set_properties(**{"text-align": "center"})
 )
 cols_used = [
     "Benchmark",
+    z_score_name,
     corr_name,
     "p-value of Corr.",
     "Snapshot Date",
 ]
 st.dataframe(
     data=styled_data,
     column_order=cols_used,
     },
     inplace=True,
 )
+with st.expander(label="Aggragate Benchmark scores"):
     st.dataframe(
         data=aggragare_score_df,
         hide_index=True,
     """
     )
+st.subheader("Benchmark Report Card")
+benchmarks = allbench.df["scenario"].unique().tolist()
+index_to_use = 0
+if not my_benchmark.is_empty:
+    index_to_use = benchmarks.index(my_benchmark.df["scenario"].unique()[0])
+plotted_scenario = st.selectbox(
+    "Choose Benchmark to plot",
+    benchmarks,
+    index=index_to_use,
+)
+col1, col2, col3 = st.columns(3)
+cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
+col1.metric("Relative agreement", cur_data["Relative agreement (Z Score)"])
+col2.metric("Kendall Tau Corr.", cur_data["Kendall Tau Corr."])
+col3.metric("p-value of Corr.", cur_data["p-value of Corr."])
+cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
+# Filter models that are present in both scenarios
+models_in_both = cur_df.groupby("model")["scenario"].nunique().eq(2).index
+# Pivot the DataFrame to have scenarios as columns
+df_pivot = cur_df[cur_df["model"].isin(models_in_both)].pivot(
+    index="model", columns="scenario", values="score"
+)
+# Create the scatter plot using Plotly Express
+fig = px.scatter(
+    df_pivot,
+    x=df_pivot.columns[0],
+    y=df_pivot.columns[1],
+    trendline="ols",
+    labels={
+        df_pivot.columns[0]: df_pivot.columns[0],
+        df_pivot.columns[1]: df_pivot.columns[1],
+    },
+    hover_name=df_pivot.index,
+    title="Model Scores Comparison between Scenarios",
+)
+st.plotly_chart(fig, use_container_width=True)
 st.markdown(
     "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
     "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
             """)
 fig = px.histogram(
     data.query("Benchmark!=@plotted_scenario"), x=corr_name, nbins=len(data) - 1
 )