Commit baec6d9 by Yotam-Perlitz
Parent(s): dcfe1ca

fix csv saving

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>
app.py CHANGED
@@ -293,7 +293,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
     uploaded_file = st.file_uploader("add your benchmark as a CSV")
     st.download_button(
         label="Download example CSV",
-        data=pd.read_csv("assets/mybench.csv").to_csv().encode("utf-8"),
+        data=pd.read_csv("assets/mybench.csv").to_csv(index=False).encode("utf-8"),
         file_name="mybench.csv",
         mime="text/csv",
     )
@@ -341,7 +341,11 @@ def run_load(
     if os.path.exists(cache_path) and use_caching:
         print("Loading cached results...")
         agreements = pd.read_csv(cache_path)
-
+        aggregate_scores = pd.read_csv(
+            cache_path.replace("agreement", "aggregate_scores")
+        )
+
+        return agreements, aggregate_scores
 
     else:
         print("Cached results not found, calculating")
@@ -366,6 +370,10 @@ def run_load(
         min_scenario_for_models_to_appear_in_agg=5,
     )
 
+    aggragate_scores = holistic.df.query('scenario=="aggregate"')[
+        ["model", "score"]
+    ].sort_values(by="score", ascending=False)
+
     allbench = Benchmark()
     allbench.load_local_catalog()
 
@@ -387,11 +395,14 @@ def run_load(
     )
 
     agreements.to_csv(cache_path, index=False)
+    aggragate_scores.to_csv(
+        cache_path.replace("agreement", "aggregate_scores"), index=False
+    )
 
-    return agreements
+    return agreements, aggragate_scores
 
 
-agreements = run_load(
+agreements, aggragare_score_df = run_load(
     aggragate_scenario_blacklist=aggragate_scenario_blacklist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
@@ -467,6 +478,22 @@ st.dataframe(
     height=500,
 )
 
+aggragare_score_df.rename(
+    columns={
+        "model": "Model",
+        "score": "Mean Win Rate over Selected Scenarios for Aggragate",
+    },
+    inplace=True,
+)
+with st.expander(label="Model scored by the aggragate"):
+    st.dataframe(
+        data=aggragare_score_df,
+        hide_index=True,
+        height=500,
+        use_container_width=True,
+    )
+
+
 st.markdown(
     "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
     "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
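For reference, the change behind the "fix csv saving" message: pandas writes the integer index as an extra leading column by default, so the example CSV offered for download gained a spurious unnamed first column that reappears as "Unnamed: 0" when the file is read back. Passing index=False to to_csv() avoids this. A minimal, self-contained sketch, with a toy DataFrame standing in for assets/mybench.csv:

import pandas as pd

# Toy stand-in for assets/mybench.csv
df = pd.DataFrame({"model": ["model-a", "model-b"], "score": [0.7, 0.6]})

# Default to_csv() prepends the integer index as an unnamed first column.
print(df.to_csv().splitlines()[0])             # ",model,score"

# index=False keeps the header clean, so a later read_csv() round-trip
# does not pick up a spurious "Unnamed: 0" column.
print(df.to_csv(index=False).splitlines()[0])  # "model,score"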
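The run_load hunks follow the same idea for the on-disk cache: the aggregate-score table is written next to the agreements table (its filename derived by swapping "agreement" for "aggregate_scores" in the cache path), both files are saved with index=False, and both frames are returned whether they come from cache or a fresh computation. A minimal sketch of that two-file pattern; the placeholder DataFrames stand in for the real BenchBench computation and load_or_compute is a hypothetical name, not a function in app.py:

import os

import pandas as pd


def load_or_compute(cache_path, use_caching=True):
    # Derive the second cache file from the first, as the commit does.
    scores_path = cache_path.replace("agreement", "aggregate_scores")

    if os.path.exists(cache_path) and use_caching:
        print("Loading cached results...")
        agreements = pd.read_csv(cache_path)
        aggregate_scores = pd.read_csv(scores_path)
        return agreements, aggregate_scores

    print("Cached results not found, calculating")
    # Placeholder results; app.py builds these from the BenchBench catalog.
    agreements = pd.DataFrame({"scenario": ["s1", "s2"], "agreement": [0.9, 0.8]})
    aggregate_scores = pd.DataFrame({"model": ["m1", "m2"], "score": [0.7, 0.6]})

    # index=False on both writes keeps the cached CSVs free of index columns.
    agreements.to_csv(cache_path, index=False)
    aggregate_scores.to_csv(scores_path, index=False)
    return agreements, aggregate_scores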