Yotam-Perlitz committed
Commit 1e20a46 • 1 Parent(s): 3ce2cf9

update example file

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

app.py CHANGED
@@ -7,221 +7,15 @@ import streamlit as st
7
  from bat import Benchmark, Config, Reporter, Tester
8
 
9
 
10
- def get_nice_benchmark_name(bench_name):
11
- prettified_names = {
12
- "holmes": "Holmes",
13
- "helm_lite_narrativeqa": "Helm Lite NarrativeQA",
14
- "helm_lite_naturalquestionsopen": "Helm Lite NaturalQuestionsOpen",
15
- "helm_lite_naturalquestionsclosed": "Helm Lite NaturalQuestionsClosed",
16
- "helm_lite_openbookqa": "Helm Lite OpenBookQA",
17
- "helm_lite_mmlu": "Helm Lite MMLU",
18
- "helm_lite_math_equivalentcot": "Helm Lite MathEquivalentCOT",
19
- "helm_lite_gsm8k": "Helm Lite GSM8K",
20
- "helm_lite_legalbench": "Helm Lite LegalBench",
21
- "helm_lite_medqa": "Helm Lite MedQA",
22
- "helm_lite_wmt2014": "Helm Lite WMT2014",
23
- "hfv2_bbh": "HFv2 BBH",
24
- "hfv2_bbh_raw": "HFv2 BBH Raw",
25
- "hfv2_gpqa": "HFv2 GPQA",
26
- "hfv2_ifeval": "HFv2 IFEval",
27
- "hfv2_math_lvl_5": "HFv2 Math Level 5",
28
- "hfv2_mmlu_pro": "HFv2 MMLU Pro",
29
- "hfv2_musr": "HFv2 MuSR",
30
- "oc_mmlu": "OpenCompass MMLU",
31
- "oc_mmlu_pro": "OpenCompass MMLU Pro",
32
- "oc_cmmlu": "OpenCompass CMMLU",
33
- "oc_bbh": "OpenCompass BBH",
34
- "oc_gqpa_dimand": "OpenCompass GQPA-Dimand",
35
- "oc_humaneval": "OpenCompass HumanEval",
36
- "oc_ifeval": "OpenCompass IFEval",
37
- "helm_mmlu": "Helm MMLU",
38
- "helm_boolq": "Helm BoolQ",
39
- "helm_narrativeqa": "Helm NarrativeQA",
40
- "helm_naturalquestionsclosed": "Helm NaturalQuestionsClosed",
41
- "helm_naturalquestionsopen": "Helm NaturalQuestionsOpen",
42
- "helm_quac": "Helm QuAC",
43
- "helm_openbookqa": "Helm OpenBookQA",
44
- "helm_imdb": "Helm IMDB",
45
- "helm_civilcomments": "Helm CivilComments",
46
- "helm_raft": "Helm RAFT",
47
- "mmlu_pro": "MMLU Pro",
48
- "mixeval_triviaqa": "MixEval TriviaQA",
49
- "mixeval_mmlu": "MixEval MMLU",
50
- "mixeval_drop": "MixEval DROP",
51
- "mixeval_hellaswag": "MixEval HellaSwag",
52
- "mixeval_commonsenseqa": "MixEval CommonsenseQA",
53
- "mixeval_triviaqa_hard": "MixEval TriviaQA Hard",
54
- "mixeval_mmlu_hard": "MixEval MMLU Hard",
55
- "mixeval_drop_hard": "MixEval DROP Hard",
56
- "oc_language": "OpenCompass Language",
57
- "oc_knowledge": "OpenCompass Knowledge",
58
- "oc_reasoning": "OpenCompass Reasoning",
59
- "oc_math": "OpenCompass Math",
60
- "oc_code": "OpenCompass Code",
61
- "oc_instruct": "OpenCompass Instruction",
62
- "oc_agent": "OpenCompass Agent",
63
- "oc_arena": "OpenCompass Arena",
64
- "lb_reasoning": "LiveBench Reasoning",
65
- "lb_coding": "LiveBench Coding",
66
- "lb_mathematics": "LiveBench Mathematics",
67
- "lb_data_analysis": "LiveBench Data Analysis",
68
- "lb_language": "LiveBench Language",
69
- "lb_if": "LiveBench Instruction Following",
70
- "wb_info_seek": "WildBench Information Seeking",
71
- "wb_creative": "WildBench Creative",
72
- "wb_code_debug": "WildBench Code Debugging",
73
- "wb_math_data": "WildBench Math & Data",
74
- "wb_reason_plan": "WildBench Reasoning & Planning",
75
- "wb_score": "WildBench Score",
76
- "hfv1_arc": "HFv1 ARC",
77
- "hfv1_gsm8k": "HFv1 GSM8K",
78
- "hfv1_hellaswag": "HFv1 HellaSwag",
79
- "hfv1_mmlu": "HFv1 MMLU",
80
- "hfv1_truthfulqa": "HFv1 TruthfulQA",
81
- "hfv1_winogrande": "HFv1 Winogrande",
82
- "biggen_grounding": "BigBench Grounding",
83
- "biggen_instruction_following": "BigBench Instruction Following",
84
- "biggen_planning": "BigBench Planning",
85
- "biggen_reasoning": "BigBench Reasoning",
86
- "biggen_refinement": "BigBench Refinement",
87
- "biggen_safety": "BigBench Safety",
88
- "biggen_theory_of_mind": "BigBench Theory of Mind",
89
- "biggen_tool_usage": "BigBench Tool Usage",
90
- "biggen_multilingual": "BigBench Multilingual",
91
- "lb_reasoning_average": "LiveBench Reasoning Average",
92
- "lb_coding_average": "LiveBench Coding Average",
93
- "lb_mathematics_average": "LiveBench Mathematics Average",
94
- "lb_data_analysis_average": "LiveBench Data Analysis Average",
95
- "lb_language_average": "LiveBench Language Average",
96
- "lb_if_average": "LiveBench Instruction Following Average",
97
- "helm_lite": "Helm Lite",
98
- "hf_open_llm_v2": "HF OpenLLM v2",
99
- "opencompass_academic": "OpenCompass Academic",
100
- "arena_elo": "Arena Elo",
101
- "helm_classic": "Helm Classic",
102
- "mixeval": "MixEval",
103
- "mixeval_hard": "MixEval Hard",
104
- "opencompass": "OpenCompass",
105
- "alphacaeval_v2lc": "AlphacaEval v2lc",
106
- "livebench_240725": "LiveBench 240725",
107
- "wb_elo_lc": "WildBench Elo LC",
108
- "arena_hard": "Arena Hard",
109
- "agentbench": "AgentBench",
110
- "hf_open_llm_v1": "HF OpenLLM v1",
111
- "biggen": "BigBench",
112
- "livebench_240624": "LiveBench 240624",
113
- "mt_bench": "MT-Bench",
114
- }
115
-
116
- if bench_name in prettified_names:
117
- return prettified_names[bench_name]
118
- else:
119
- return bench_name
120
-
121
-
122
  holistic_scenarios = [
123
- get_nice_benchmark_name(scen)
124
- for scen in [
125
- # "holmes",
126
- "helm_lite",
127
- # "narrativeqa",
128
- # "naturalquestionsopen",
129
- # "naturalquestionsclosed",
130
- # "openbookqa",
131
- # "mmlu",
132
- # "math_equivalentcot",
133
- # "gsm8k",
134
- # "legalbench",
135
- # "medqa",
136
- # "wmt2014",
137
- # "arc_c",
138
- # "arc_e",
139
- # "boolq",
140
- # "csqa",
141
- # "hellaswag",
142
- # "piqa",
143
- # "siqa",
144
- # "winogrande",
145
- # "olmes_average",
146
- # "bbh",
147
- # "bbh_raw",
148
- # "gpqa",
149
- "hf_open_llm_v2",
150
- # "ifeval",
151
- # "math_lvl_5",
152
- # "mmlu_pro",
153
- # "musr",
154
- "opencompass_academic",
155
- # "oc_mmlu",
156
- # "oc_mmlu_pro",
157
- # "oc_cmmlu",
158
- # "oc_bbh",
159
- # "oc_gqpa_dimand",
160
- # "oc_math",
161
- # "oc_humaneval",
162
- # "oc_ifeval",
163
- # "helm_mmlu",
164
- "arena_elo",
165
- "helm_classic",
166
- # "quac",
167
- # "truthfulqa",
168
- # "ms_marcoregular",
169
- # "ms_marcotrec",
170
- # "cnn/dailymail",
171
- # "xsum",
172
- # "imdb",
173
- # "civilcomments",
174
- # "raft",
175
- "mixeval_hard",
176
- "mixeval",
177
- # "arena_elo0527",
178
- "opencompass",
179
- # "oc_language",
180
- # "oc_knowledge",
181
- # "oc_reasoning",
182
- # "oc_code",
183
- # "oc_instruct",
184
- # "oc_agent",
185
- # "oc_arena",
186
- "alphacaeval_v2lc",
187
- "livebench_240725",
188
- "livebench_240624",
189
- # "lb_reasoning",
190
- # "lb_coding",
191
- # "lb_mathematics",
192
- # "lb_data_analysis",
193
- # "lb_language",
194
- # "lb_if",
195
- "wb_elo_lc",
196
- # "wb_info_seek",
197
- # "wb_creative",
198
- # "wb_code_debug",
199
- # "wb_math_data",
200
- # "wb_reason_plan",
201
- # "wb_score",
202
- # "boolqmixed",
203
- "arena_hard",
204
- "agentbench",
205
- # "arc",
206
- "hf_open_llm_v1",
207
- "biggen",
208
- # "biggen_grounding",
209
- # "biggen_instruction_following",
210
- # "biggen_planning",
211
- # "biggen_reasoning",
212
- # "biggen_refinement",
213
- # "biggen_safety",
214
- # "biggen_theory_of_mind",
215
- # "biggen_tool_usage",
216
- # "biggen_multilingual",
217
- # "lb_global_average",
218
- # "lb_reasoning_average",
219
- # "lb_coding_average",
220
- # "lb_mathematics_average",
221
- # "lb_data_analysis_average",
222
- # "lb_language_average",
223
- # "lb_if_average",
224
- ]
225
  ]
226
 
227
 
@@ -245,30 +39,31 @@ all_scenarios_for_aggragate = (
245
  st.subheader("The Leaderboard", divider=True)
246
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
247
 
248
- leftcol, rightcol = st.columns([2, 1])
249
 
250
- with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
251
- with st.form("my_form"):
252
- all_scenarios_for_aggragate_with_all = [
253
- get_nice_benchmark_name(scenario)
254
- for scenario in all_scenarios_for_aggragate
255
- ]
256
-
257
- aggragate_scenarios = st.multiselect(
258
- "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
259
- all_scenarios_for_aggragate,
260
- holistic_scenarios,
261
- )
262
 
 
 
263
  corr_type = st.selectbox(
264
  label="Select Correlation type", options=["kendall", "pearson"], index=0
265
  )
266
 
267
- aggragate_scenario_blacklist = [
268
- scen
269
- for scen in all_scenarios_for_aggragate
270
- if scen not in aggragate_scenarios
271
- ]
 
272
 
273
  model_select_strategy = st.selectbox(
274
  label="Select strategy",
@@ -289,23 +84,25 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
289
 
290
  submitted = st.form_submit_button(label="Run BAT")
291
 
292
 
293
- uploaded_file = st.file_uploader("add your benchmark as a CSV")
294
- st.download_button(
295
- label="Download example CSV",
296
- data=pd.read_csv("assets/mybench.csv").to_csv(index=False).encode("utf-8"),
297
- file_name="mybench.csv",
298
- mime="text/csv",
299
- )
300
-
301
- my_benchmark = Benchmark()
302
- if uploaded_file is not None:
303
- df = pd.read_csv(uploaded_file)
304
- my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
305
 
306
 
307
  def run_load(
308
- aggragate_scenario_blacklist=[],
309
  n_models_taken_list=[5],
310
  model_select_strategy_list=["random"],
311
  corr_types=["kendall"],
@@ -315,7 +112,7 @@ def run_load(
315
  ):
316
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
317
  input_str = (
318
- str(aggragate_scenario_blacklist)
319
  + str(n_models_taken_list)
320
  + str(model_select_strategy_list)
321
  + str(corr_types)
@@ -358,25 +155,30 @@ def run_load(
358
  n_exps=n_exps if n_models_taken_list != [0] else 1,
359
  )
360
 
361
- holistic = Benchmark()
362
- holistic.load_local_catalog()
363
- holistic.df = holistic.df.query("scenario in @holistic_scenarios")
 
 
364
 
365
- holistic.clear_repeated_scenarios()
366
- holistic.add_aggragete(
 
 
 
 
 
 
367
  new_col_name="aggregate",
368
- agg_source_name="holistic",
369
- scenario_blacklist=aggragate_scenario_blacklist,
370
- min_scenario_for_models_to_appear_in_agg=5,
371
  )
372
 
373
- aggragate_scores = holistic.df.query('scenario=="aggregate"')[
374
  ["model", "score"]
375
  ].sort_values(by="score", ascending=False)
376
 
377
- allbench = Benchmark()
378
- allbench.load_local_catalog()
379
-
380
  # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
381
 
382
  allbench.extend(my_benchmark)
@@ -384,8 +186,8 @@ def run_load(
384
  allbench.clear_repeated_scenarios()
385
 
386
  # removing and adding the holistic scenarios
387
- allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
388
- allbench = allbench.extend(holistic)
389
 
390
  tester = Tester(cfg=cfg)
391
 
@@ -403,7 +205,7 @@ def run_load(
403
 
404
 
405
  agreements, aggragare_score_df = run_load(
406
- aggragate_scenario_blacklist=aggragate_scenario_blacklist,
407
  n_models_taken_list=n_models_taken_list,
408
  model_select_strategy_list=[model_select_strategy],
409
  corr_types=[corr_type],
@@ -422,12 +224,18 @@ z_scores["corr_with_agg"] = z_scores["corr_with_agg"].round(2)
422
  z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
423
  # z_scores["n_models_of_corr_with_agg"] = z_scores["n_models_of_corr_with_agg"].round(1)
424
 
425
- z_scores["date"] = z_scores["source"].apply(lambda x: x.split(".csv")[0].split("_")[-1])
 
 
 
 
426
 
427
- # print(z_scores["scenario"].unique().tolist())
428
 
429
- z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
430
 
 
 
 
431
  data = (
432
  z_scores.rename(
433
  columns={
@@ -468,24 +276,26 @@ styled_data = (
468
  vmax=1,
469
  )
470
  .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
 
471
  )
472
 
473
- # print(data["Benchmark"].unique().tolist())
474
-
 
 
 
 
 
475
  st.dataframe(
476
  data=styled_data,
477
- column_order=[
478
- "Benchmark",
479
- "Z Score",
480
- corr_name,
481
- "p-value of Corr.",
482
- "Snapshot Date",
483
- ],
484
  hide_index=True,
485
  use_container_width=True,
486
  height=500,
 
487
  )
488
 
 
489
  aggragare_score_df.rename(
490
  columns={
491
  "model": "Model",
@@ -787,7 +597,7 @@ benchmarks = data["Benchmark"].unique().tolist()
787
  plotted_scenario = st.selectbox(
788
  "Choose Benchmark to plot",
789
  benchmarks,
790
- index=benchmarks.index("Arena Elo"),
791
  )
792
 
793
 
 
7
  from bat import Benchmark, Config, Reporter, Tester
8
 
9
 
10
  holistic_scenarios = [
11
+ "Helm Lite",
12
+ "HF OpenLLM v2",
13
+ "OpenCompass Academic",
14
+ "LMSys Arena",
15
+ "Helm Classic",
16
+ "AlphacaEval v2lc",
17
+ "LiveBench 240725",
18
+ "WildBench Elo LC",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  ]
20
 
21
 
 
39
  st.subheader("The Leaderboard", divider=True)
40
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
41
 
 
42
 
43
+ with st.form("my_form_0"):
44
+ # leftcol, rightcol = st.columns([5, 1])
45
+ # with leftcol:
46
+ aggragate_scenarios = st.multiselect(
47
+ "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
48
+ all_scenarios_for_aggragate,
49
+ holistic_scenarios,
50
+ )
51
+ # with rightcol:
52
+ # st.markdown("###")
53
+ submitted = st.form_submit_button(label="\n\nRun BAT\n\n")
 
54
 
55
+ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
56
+ with st.form("my_form_1"):
57
  corr_type = st.selectbox(
58
  label="Select Correlation type", options=["kendall", "pearson"], index=0
59
  )
60
 
61
+ aggragate_scenario_whitelist = aggragate_scenarios
62
+ # [
63
+ # scen
64
+ # for scen in all_scenarios_for_aggragate
65
+ # if scen not in aggragate_scenarios
66
+ # ]
67
 
68
  model_select_strategy = st.selectbox(
69
  label="Select strategy",
 
84
 
85
  submitted = st.form_submit_button(label="Run BAT")
86
 
87
+ with st.expander("Add your benchmarks here!", icon="πŸ”₯"):
88
+ uploaded_file = st.file_uploader("Add your benchmark as a CSV")
89
+ st.download_button(
90
+ label="Download example CSV",
91
+ data=pd.read_csv("assets/mybench_240901.csv")
92
+ .to_csv(index=False)
93
+ .encode("utf-8"),
94
+ file_name="mybench_240901.csv",
95
+ mime="text/csv",
96
+ )
97
 
98
+ my_benchmark = Benchmark()
99
+ if uploaded_file is not None:
100
+ df = pd.read_csv(uploaded_file)
101
+ my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
102
 
103
 
104
  def run_load(
105
+ aggregate_scenario_whitelist,
106
  n_models_taken_list=[5],
107
  model_select_strategy_list=["random"],
108
  corr_types=["kendall"],
 
112
  ):
113
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
114
  input_str = (
115
+ str(aggregate_scenario_whitelist)
116
  + str(n_models_taken_list)
117
  + str(model_select_strategy_list)
118
  + str(corr_types)
 
155
  n_exps=n_exps if n_models_taken_list != [0] else 1,
156
  )
157
 
158
+ # holistic = Benchmark()
159
+ # holistic.load_local_catalog()
160
+ # holistic.df = holistic.df.query("scenario in @holistic_scenarios")
161
+
162
+ # holistic.clear_repeated_scenarios()
163
 
164
+ # aggragate_scores = holistic.df.query('scenario=="aggregate"')[
165
+ # ["model", "score"]
166
+ # ].sort_values(by="score", ascending=False)
167
+
168
+ allbench = Benchmark()
169
+ allbench.load_local_catalog()
170
+
171
+ allbench.add_aggregate(
172
  new_col_name="aggregate",
173
+ agg_source_name="aggregate",
174
+ scenario_whitelist=aggregate_scenario_whitelist,
175
+ min_scenario_for_models_to_appear_in_agg=1,
176
  )
177
 
178
+ aggragate_scores = allbench.df.query('scenario=="aggregate"')[
179
  ["model", "score"]
180
  ].sort_values(by="score", ascending=False)
181
 
 
 
 
182
  # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
183
 
184
  allbench.extend(my_benchmark)
 
186
  allbench.clear_repeated_scenarios()
187
 
188
  # removing and adding the holistic scenarios
189
+ # allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
190
+ # allbench = allbench.extend(holistic)
191
 
192
  tester = Tester(cfg=cfg)
193
 
 
205
 
206
 
207
  agreements, aggragare_score_df = run_load(
208
+ aggregate_scenario_whitelist=aggragate_scenario_whitelist,
209
  n_models_taken_list=n_models_taken_list,
210
  model_select_strategy_list=[model_select_strategy],
211
  corr_types=[corr_type],
 
224
  z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
225
  # z_scores["n_models_of_corr_with_agg"] = z_scores["n_models_of_corr_with_agg"].round(1)
226
 
227
+ z_scores["date"] = z_scores["source"].apply(
228
+ lambda x: x.split(".csv")[0].split("_")[-1]
229
+ if "frozen" not in x
230
+ else x.split(".csv")[0].split("_")[-2]
231
+ )
232
 
 
233
 
234
+ # print(z_scores["scenario"].unique().tolist())
235
 
236
+ # z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
237
+ z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
238
+ # , format="%y%m%d"
239
  data = (
240
  z_scores.rename(
241
  columns={
 
276
  vmax=1,
277
  )
278
  .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
279
+ .set_properties(**{"text-align": "center"})
280
  )
281
 
282
+ cols_used = [
283
+ "Benchmark",
284
+ "Z Score",
285
+ corr_name,
286
+ "p-value of Corr.",
287
+ "Snapshot Date",
288
+ ]
289
  st.dataframe(
290
  data=styled_data,
291
+ column_order=cols_used,
292
  hide_index=True,
293
  use_container_width=True,
294
  height=500,
295
+ column_config={col: {"alignment": "center"} for col in cols_used},
296
  )
297
 
298
+
299
  aggragare_score_df.rename(
300
  columns={
301
  "model": "Model",
 
597
  plotted_scenario = st.selectbox(
598
  "Choose Benchmark to plot",
599
  benchmarks,
600
+ index=benchmarks.index("LMSys Arena"),
601
  )
602
 
603
 
assets/{mybench.csv → mybench_240901.csv} RENAMED
File without changes
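
For reference, a minimal sketch (not part of the commit) of how the renamed example file feeds the whitelist-based aggregation path that app.py uses after this change. It reuses only the bat calls visible in the diff above (Benchmark, load_local_catalog, assign_df, extend, clear_repeated_scenarios, add_aggregate); the column layout assumed for assets/mybench_240901.csv (model, scenario, score) is a guess based on the columns app.py reads and is not confirmed by the commit.

# Sketch only: exercises the new whitelist-based aggregation flow from app.py
# with the renamed example file. Assumptions are marked inline.
import pandas as pd

from bat import Benchmark

# Holistic benchmarks now hard-coded in app.py (pretty names, post-commit).
holistic_scenarios = [
    "Helm Lite",
    "HF OpenLLM v2",
    "OpenCompass Academic",
    "LMSys Arena",
    "Helm Classic",
    "AlphacaEval v2lc",
    "LiveBench 240725",
    "WildBench Elo LC",
]

# Load the shipped catalog, as the Streamlit app does.
allbench = Benchmark()
allbench.load_local_catalog()

# Attach the example benchmark the way the uploader path does.
# Assumed columns: model, scenario, score (not confirmed by the commit).
my_benchmark = Benchmark()
example_df = pd.read_csv("assets/mybench_240901.csv")
my_benchmark.assign_df(example_df, data_source="Uploaded Benchmark")

allbench.extend(my_benchmark)
allbench.clear_repeated_scenarios()

# Whitelist aggregation replaces the old blacklist logic.
allbench.add_aggregate(
    new_col_name="aggregate",
    agg_source_name="aggregate",
    scenario_whitelist=holistic_scenarios,
    min_scenario_for_models_to_appear_in_agg=1,
)

# Rank models by the aggregate score, as the leaderboard table does.
aggregate_scores = allbench.df.query('scenario == "aggregate"')[
    ["model", "score"]
].sort_values(by="score", ascending=False)
print(aggregate_scores.head())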