Yotam-Perlitz committed on
Commit
baec6d9
·
1 Parent(s): dcfe1ca

fix csv saving

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1) hide show
  1. app.py +31 -4
app.py CHANGED
@@ -293,7 +293,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
293
  uploaded_file = st.file_uploader("add your benchmark as a CSV")
294
  st.download_button(
295
  label="Download example CSV",
296
- data=pd.read_csv("assets/mybench.csv").to_csv().encode("utf-8"),
297
  file_name="mybench.csv",
298
  mime="text/csv",
299
  )
@@ -341,7 +341,11 @@ def run_load(
341
  if os.path.exists(cache_path) and use_caching:
342
  print("Loading cached results...")
343
  agreements = pd.read_csv(cache_path)
344
- return agreements
 
 
 
 
345
 
346
  else:
347
  print("Cached results not found, calculating")
@@ -366,6 +370,10 @@ def run_load(
366
  min_scenario_for_models_to_appear_in_agg=5,
367
  )
368
 
 
 
 
 
369
  allbench = Benchmark()
370
  allbench.load_local_catalog()
371
 
@@ -387,11 +395,14 @@ def run_load(
387
  )
388
 
389
  agreements.to_csv(cache_path, index=False)
 
 
 
390
 
391
- return agreements
392
 
393
 
394
- agreements = run_load(
395
  aggragate_scenario_blacklist=aggragate_scenario_blacklist,
396
  n_models_taken_list=n_models_taken_list,
397
  model_select_strategy_list=[model_select_strategy],
@@ -467,6 +478,22 @@ st.dataframe(
467
  height=500,
468
  )
469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  st.markdown(
471
  "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
472
  "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
 
293
  uploaded_file = st.file_uploader("add your benchmark as a CSV")
294
  st.download_button(
295
  label="Download example CSV",
296
+ data=pd.read_csv("assets/mybench.csv").to_csv(index=False).encode("utf-8"),
297
  file_name="mybench.csv",
298
  mime="text/csv",
299
  )
 
341
  if os.path.exists(cache_path) and use_caching:
342
  print("Loading cached results...")
343
  agreements = pd.read_csv(cache_path)
344
+ aggregate_scores = pd.read_csv(
345
+ cache_path.replace("agreement", "aggregate_scores")
346
+ )
347
+
348
+ return agreements, aggregate_scores
349
 
350
  else:
351
  print("Cached results not found, calculating")
 
370
  min_scenario_for_models_to_appear_in_agg=5,
371
  )
372
 
373
+ aggragate_scores = holistic.df.query('scenario=="aggregate"')[
374
+ ["model", "score"]
375
+ ].sort_values(by="score", ascending=False)
376
+
377
  allbench = Benchmark()
378
  allbench.load_local_catalog()
379
 
 
395
  )
396
 
397
  agreements.to_csv(cache_path, index=False)
398
+ aggragate_scores.to_csv(
399
+ cache_path.replace("agreement", "aggregate_scores"), index=False
400
+ )
401
 
402
+ return agreements, aggragate_scores
403
 
404
 
405
+ agreements, aggragare_score_df = run_load(
406
  aggragate_scenario_blacklist=aggragate_scenario_blacklist,
407
  n_models_taken_list=n_models_taken_list,
408
  model_select_strategy_list=[model_select_strategy],
 
478
  height=500,
479
  )
480
 
481
+ aggragare_score_df.rename(
482
+ columns={
483
+ "model": "Model",
484
+ "score": "Mean Win Rate over Selected Scenarios for Aggragate",
485
+ },
486
+ inplace=True,
487
+ )
488
+ with st.expander(label="Model scored by the aggragate"):
489
+ st.dataframe(
490
+ data=aggragare_score_df,
491
+ hide_index=True,
492
+ height=500,
493
+ use_container_width=True,
494
+ )
495
+
496
+
497
  st.markdown(
498
  "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
499
  "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "