Clémentine committed
Commit 6e8f400
1 Parent(s): d52179b
app.py CHANGED
@@ -12,7 +12,6 @@ from transformers import AutoConfig
12
 
13
  from src.auto_leaderboard.get_model_metadata import apply_metadata
14
  from src.assets.text_content import *
15
- from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
16
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
17
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
18
  from src.assets.css_html_js import custom_css, get_window_url_params
@@ -22,8 +21,6 @@ from src.init import load_all_info_from_hub
22
  # clone / pull the lmeh eval data
23
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
24
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
25
- HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
26
- GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
27
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
28
  ADD_PLOTS = False
29
 
@@ -37,7 +34,7 @@ def restart_space():
37
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
38
  )
39
 
40
- auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
41
 
42
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
43
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -53,10 +50,6 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
53
 
54
  BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
55
 
56
- ELO_COLS = [c.name for c in fields(EloEvalColumn)]
57
- ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
58
- ELO_SORT_COL = EloEvalColumn.gpt4.name
59
-
60
 
61
  def has_no_nan_values(df, columns):
62
  return df[columns].notna().all(axis=1)
@@ -138,41 +131,6 @@ def get_evaluation_queue_df():
138
  return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
139
 
140
 
141
- def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
142
- if human_eval_repo:
143
- print("Pulling human_eval_repo changes")
144
- human_eval_repo.git_pull()
145
-
146
- all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
147
- dataframe = pd.DataFrame.from_records(all_data)
148
- dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
149
- dataframe = dataframe[ELO_COLS]
150
- return dataframe
151
-
152
-
153
- def get_elo_elements():
154
- df_instruct = pd.read_json("human_evals/without_code.json")
155
- df_code_instruct = pd.read_json("human_evals/with_code.json")
156
-
157
- elo_leaderboard = get_elo_leaderboard(
158
- df_instruct, df_code_instruct, tie_allowed=False
159
- )
160
- elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
161
- df_instruct, df_code_instruct, tie_allowed=True
162
- )
163
- plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
164
- df_instruct, df_code_instruct, tie_allowed=False
165
- )
166
-
167
- return (
168
- elo_leaderboard,
169
- elo_leaderboard_with_tie_allowed,
170
- plot_1,
171
- plot_2,
172
- plot_3,
173
- plot_4,
174
- )
175
-
176
 
177
  original_df = get_leaderboard_df()
178
  leaderboard_df = original_df.copy()
@@ -181,15 +139,6 @@ leaderboard_df = original_df.copy()
181
  running_eval_queue_df,
182
  pending_eval_queue_df,
183
  ) = get_evaluation_queue_df()
184
- (
185
- elo_leaderboard,
186
- elo_leaderboard_with_tie_allowed,
187
- plot_1,
188
- plot_2,
189
- plot_3,
190
- plot_4,
191
- ) = get_elo_elements()
192
-
193
 
194
  def is_model_on_hub(model_name, revision) -> bool:
195
  try:
@@ -305,188 +254,153 @@ def change_tab(query_param):
305
  demo = gr.Blocks(css=custom_css)
306
  with demo:
307
  gr.HTML(TITLE)
 
308
  with gr.Row():
309
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
310
-
311
- with gr.Row():
312
- with gr.Column():
313
- with gr.Accordion("📙 Citation", open=False):
314
- citation_button = gr.Textbox(
315
- value=CITATION_BUTTON_TEXT,
316
- label=CITATION_BUTTON_LABEL,
317
- elem_id="citation-button",
318
- ).style(show_copy_button=True)
319
- with gr.Column():
320
- with gr.Accordion("✨ CHANGELOG", open=False):
321
- changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
322
 
323
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
324
- with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table", id=0):
325
- with gr.Column():
326
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
327
- with gr.Box(elem_id="search-bar-table-box"):
328
- search_bar = gr.Textbox(
329
- placeholder="🔍 Search your model and press ENTER...",
330
- show_label=False,
331
- elem_id="search-bar",
332
- )
333
- with gr.Tabs(elem_classes="tab-buttons"):
334
- with gr.TabItem("Light View"):
335
- leaderboard_table_lite = gr.components.Dataframe(
336
- value=leaderboard_df[COLS_LITE],
337
- headers=COLS_LITE,
338
- datatype=TYPES_LITE,
339
- max_rows=None,
340
- elem_id="leaderboard-table-lite",
341
- )
342
- with gr.TabItem("Extended Model View"):
343
- leaderboard_table = gr.components.Dataframe(
344
- value=leaderboard_df,
345
- headers=COLS,
346
- datatype=TYPES,
347
- max_rows=None,
348
- elem_id="leaderboard-table",
349
- )
350
-
351
- # Dummy leaderboard for handling the case when the user uses backspace key
352
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
353
- value=original_df,
354
- headers=COLS,
355
- datatype=TYPES,
356
- max_rows=None,
357
- visible=False,
358
- )
359
- search_bar.submit(
360
- search_table,
361
- [hidden_leaderboard_table_for_search, search_bar],
362
- leaderboard_table,
363
- )
364
 
365
- # Dummy leaderboard for handling the case when the user uses backspace key
366
- hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
367
- value=original_df[COLS_LITE],
368
- headers=COLS_LITE,
369
- datatype=TYPES_LITE,
370
- max_rows=None,
371
- visible=False,
372
- )
373
- search_bar.submit(
374
- search_table,
375
- [hidden_leaderboard_table_for_search_lite, search_bar],
376
- leaderboard_table_lite,
377
- )
378
 
379
- with gr.Row():
380
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
381
-
382
- with gr.Accordion("✅ Finished Evaluations", open=False):
383
- with gr.Row():
384
- finished_eval_table = gr.components.Dataframe(
385
- value=finished_eval_queue_df,
386
- headers=EVAL_COLS,
387
- datatype=EVAL_TYPES,
388
- max_rows=5,
389
- )
390
- with gr.Accordion("🔄 Running Evaluation Queue", open=False):
391
- with gr.Row():
392
- running_eval_table = gr.components.Dataframe(
393
- value=running_eval_queue_df,
394
- headers=EVAL_COLS,
395
- datatype=EVAL_TYPES,
396
- max_rows=5,
397
- )
398
-
399
- with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
400
- with gr.Row():
401
- pending_eval_table = gr.components.Dataframe(
402
- value=pending_eval_queue_df,
403
- headers=EVAL_COLS,
404
- datatype=EVAL_TYPES,
405
- max_rows=5,
406
- )
407
 
408
  with gr.Row():
409
- refresh_button = gr.Button("Refresh")
410
- refresh_button.click(
411
- refresh,
412
- inputs=[],
413
- outputs=[
414
- leaderboard_table,
415
- finished_eval_table,
416
- running_eval_table,
417
- pending_eval_table,
418
- ],
419
  )
420
- with gr.Accordion("Submit a new model for evaluation"):
421
- with gr.Row():
422
- with gr.Column():
423
- model_name_textbox = gr.Textbox(label="Model name")
424
- revision_name_textbox = gr.Textbox(
425
- label="revision", placeholder="main"
426
- )
427
-
428
- with gr.Column():
429
- is_8bit_toggle = gr.Checkbox(
430
- False, label="8 bit eval", visible=not IS_PUBLIC
431
- )
432
- private = gr.Checkbox(
433
- False, label="Private", visible=not IS_PUBLIC
434
- )
435
- is_delta_weight = gr.Checkbox(False, label="Delta weights")
436
- base_model_name_textbox = gr.Textbox(
437
- label="base model (for delta)"
438
- )
439
-
440
- submit_button = gr.Button("Submit Eval")
441
- submission_result = gr.Markdown()
442
- submit_button.click(
443
- add_new_eval,
444
- [
445
- model_name_textbox,
446
- base_model_name_textbox,
447
- revision_name_textbox,
448
- is_8bit_toggle,
449
- private,
450
- is_delta_weight,
451
- ],
452
- submission_result,
453
  )
454
- with gr.TabItem(
455
- "🧑‍⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table", id=1
456
- ):
457
- with gr.Row():
458
- with gr.Column(scale=2):
459
- gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
460
- with gr.Column(scale=1):
461
- gr.Image(
462
- "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
463
  )
464
- gr.Markdown("## No tie allowed")
465
- elo_leaderboard_table = gr.components.Dataframe(
466
- value=elo_leaderboard,
467
- headers=ELO_COLS,
468
- datatype=ELO_TYPES,
469
- max_rows=5,
470
- )
471
 
472
- gr.Markdown("## Tie allowed*")
473
- elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
474
- value=elo_leaderboard_with_tie_allowed,
475
- headers=ELO_COLS,
476
- datatype=ELO_TYPES,
477
- max_rows=5,
478
  )
479
 
480
- gr.Markdown(
481
- "\* Results when the scores of 4 and 5 were treated as ties.",
482
- elem_classes="markdown-text",
483
- )
484
 
485
- gr.Markdown(
486
- "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
487
- elem_id="models-to-add-text",
488
  )
489
 
490
  dummy = gr.Textbox(visible=False)
491
  demo.load(
492
  change_tab,
@@ -494,23 +408,6 @@ with demo:
494
  tabs,
495
  _js=get_window_url_params,
496
  )
497
- if ADD_PLOTS:
498
- with gr.Box():
499
- visualization_title = gr.HTML(VISUALIZATION_TITLE)
500
- with gr.Row():
501
- with gr.Column():
502
- gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
503
- plot_1 = gr.Plot(plot_1, show_label=False)
504
- with gr.Column():
505
- gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
506
- plot_2 = gr.Plot(plot_2, show_label=False)
507
- with gr.Row():
508
- with gr.Column():
509
- gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
510
- plot_3 = gr.Plot(plot_3, show_label=False)
511
- with gr.Column():
512
- gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
513
- plot_4 = gr.Plot(plot_4, show_label=False)
514
 
515
  scheduler = BackgroundScheduler()
516
  scheduler.add_job(restart_space, "interval", seconds=3600)
 
12
 
13
  from src.auto_leaderboard.get_model_metadata import apply_metadata
14
  from src.assets.text_content import *
 
15
  from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
  from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
17
  from src.assets.css_html_js import custom_css, get_window_url_params
 
21
  # clone / pull the lmeh eval data
22
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
23
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 
 
24
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
25
  ADD_PLOTS = False
26
 
 
34
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
35
  )
36
 
37
+ auto_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO)
38
 
39
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
40
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 
50
 
51
  BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
52
 
53
 
54
  def has_no_nan_values(df, columns):
55
  return df[columns].notna().all(axis=1)
 
131
  return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
132
 
133
 
134
 
135
  original_df = get_leaderboard_df()
136
  leaderboard_df = original_df.copy()
 
139
  running_eval_queue_df,
140
  pending_eval_queue_df,
141
  ) = get_evaluation_queue_df()
142
 
143
  def is_model_on_hub(model_name, revision) -> bool:
144
  try:
 
254
  demo = gr.Blocks(css=custom_css)
255
  with demo:
256
  gr.HTML(TITLE)
257
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
258
  with gr.Row():
259
+ with gr.Box(elem_id="search-bar-table-box"):
260
+ search_bar = gr.Textbox(
261
+ placeholder="🔍 Search your model and press ENTER...",
262
+ show_label=False,
263
+ elem_id="search-bar",
264
+ )
265
 
266
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
267
+ with gr.TabItem("🏅 LLM Benchmark (lite)", elem_id="llm-benchmark-tab-table", id=0):
268
+ leaderboard_table_lite = gr.components.Dataframe(
269
+ value=leaderboard_df[COLS_LITE],
270
+ headers=COLS_LITE,
271
+ datatype=TYPES_LITE,
272
+ max_rows=None,
273
+ elem_id="leaderboard-table-lite",
274
+ )
275
+ # Dummy leaderboard for handling the case when the user uses backspace key
276
+ hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
277
+ value=original_df[COLS_LITE],
278
+ headers=COLS_LITE,
279
+ datatype=TYPES_LITE,
280
+ max_rows=None,
281
+ visible=False,
282
+ )
283
+ search_bar.submit(
284
+ search_table,
285
+ [hidden_leaderboard_table_for_search_lite, search_bar],
286
+ leaderboard_table_lite,
287
+ )
288
 
289
+ with gr.TabItem("📊 Extended view", elem_id="llm-benchmark-tab-table", id=1):
290
+ leaderboard_table = gr.components.Dataframe(
291
+ value=leaderboard_df,
292
+ headers=COLS,
293
+ datatype=TYPES,
294
+ max_rows=None,
295
+ elem_id="leaderboard-table",
296
+ )
297
 
298
+ # Dummy leaderboard for handling the case when the user uses backspace key
299
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
300
+ value=original_df,
301
+ headers=COLS,
302
+ datatype=TYPES,
303
+ max_rows=None,
304
+ visible=False,
305
+ )
306
+ search_bar.submit(
307
+ search_table,
308
+ [hidden_leaderboard_table_for_search, search_bar],
309
+ leaderboard_table,
310
+ )
311
+ with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
312
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
313
 
314
+ with gr.Column():
315
+ with gr.Row():
316
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
317
+
318
+ with gr.Column():
319
+ with gr.Accordion("✅ Finished Evaluations", open=False):
320
  with gr.Row():
321
+ finished_eval_table = gr.components.Dataframe(
322
+ value=finished_eval_queue_df,
323
+ headers=EVAL_COLS,
324
+ datatype=EVAL_TYPES,
325
+ max_rows=5,
326
  )
327
+ with gr.Accordion("🔄 Running Evaluation Queue", open=False):
328
+ with gr.Row():
329
+ running_eval_table = gr.components.Dataframe(
330
+ value=running_eval_queue_df,
331
+ headers=EVAL_COLS,
332
+ datatype=EVAL_TYPES,
333
+ max_rows=5,
334
  )
335
+
336
+ with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
337
+ with gr.Row():
338
+ pending_eval_table = gr.components.Dataframe(
339
+ value=pending_eval_queue_df,
340
+ headers=EVAL_COLS,
341
+ datatype=EVAL_TYPES,
342
+ max_rows=5,
 
343
  )
344
 
345
+ with gr.Row():
346
+ refresh_button = gr.Button("Refresh")
347
+ refresh_button.click(
348
+ refresh,
349
+ inputs=[],
350
+ outputs=[
351
+ leaderboard_table,
352
+ finished_eval_table,
353
+ running_eval_table,
354
+ pending_eval_table,
355
+ ],
356
  )
357
+ with gr.Accordion("Submit a new model for evaluation"):
358
+ with gr.Row():
359
+ with gr.Column():
360
+ model_name_textbox = gr.Textbox(label="Model name")
361
+ revision_name_textbox = gr.Textbox(
362
+ label="revision", placeholder="main"
363
+ )
364
 
365
+ with gr.Column():
366
+ is_8bit_toggle = gr.Checkbox(
367
+ False, label="8 bit eval", visible=not IS_PUBLIC
368
+ )
369
+ private = gr.Checkbox(
370
+ False, label="Private", visible=not IS_PUBLIC
371
+ )
372
+ is_delta_weight = gr.Checkbox(False, label="Delta weights")
373
+ base_model_name_textbox = gr.Textbox(
374
+ label="base model (for delta)"
375
+ )
376
 
377
+ submit_button = gr.Button("Submit Eval")
378
+ submission_result = gr.Markdown()
379
+ submit_button.click(
380
+ add_new_eval,
381
+ [
382
+ model_name_textbox,
383
+ base_model_name_textbox,
384
+ revision_name_textbox,
385
+ is_8bit_toggle,
386
+ private,
387
+ is_delta_weight,
388
+ ],
389
+ submission_result,
390
  )
391
 
392
+ with gr.Row():
393
+ with gr.Column():
394
+ with gr.Accordion("📙 Citation", open=False):
395
+ citation_button = gr.Textbox(
396
+ value=CITATION_BUTTON_TEXT,
397
+ label=CITATION_BUTTON_LABEL,
398
+ elem_id="citation-button",
399
+ ).style(show_copy_button=True)
400
+ with gr.Column():
401
+ with gr.Accordion("✨ CHANGELOG", open=False):
402
+ changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
403
+
404
  dummy = gr.Textbox(visible=False)
405
  demo.load(
406
  change_tab,
 
408
  tabs,
409
  _js=get_window_url_params,
410
  )
411
 
412
  scheduler = BackgroundScheduler()
413
  scheduler.add_job(restart_space, "interval", seconds=3600)
src/assets/text_content.py CHANGED
@@ -57,15 +57,16 @@ CHANGELOG_TEXT = f"""
57
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
58
 
59
  INTRODUCTION_TEXT = f"""
60
- 📐 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
61
 
62
- 🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
63
-
64
- 📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
65
  """
66
 
67
  LLM_BENCHMARKS_TEXT = f"""
68
- Evaluation is performed against 4 popular benchmarks:
69
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
70
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
71
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
@@ -74,17 +75,9 @@ Evaluation is performed against 4 popular benchmarks:
74
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
75
  """
76
 
77
- HUMAN_GPT_EVAL_TEXT = f"""
78
- Evaluation is performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts. The prompts cover tasks such as brainstorming, creative generation, commonsense reasoning, open question answering, summarization, and code generation. Comparisons are made by humans and a model on a 1-8 Likert scale, where the labeler is required to choose a preference each time. Using these preferences, we create bootstrapped Elo rankings.
79
-
80
- We collaborated with **Scale AI** to generate the completions using a professional data labeling workforce on their platform, [following the labeling instructions found here](https://docs.google.com/document/d/1c5-96Lj-UH4lzKjLvJ_MRQaVMjtoEXTYA4dvoAYVCHc/edit?usp=sharing). To understand the evaluation of popular models, we also had GPT-4 label the completions using this prompt.
81
-
82
- For more information on the calibration and initiation of these measurements, please refer to the [announcement blog post](https://huggingface.co/blog/llm-leaderboard). We would like to express our gratitude to **LMSYS** for providing a [useful notebook](https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5?usp=sharing) for computing Elo estimates and plots.
83
- """
84
-
85
-
86
  EVALUATION_QUEUE_TEXT = f"""
87
- # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
 
88
  """
89
 
90
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -153,14 +146,4 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
153
  eprint={2109.07958},
154
  archivePrefix={arXiv},
155
  primaryClass={cs.CL}
156
- }"""
157
-
158
- VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
159
-
160
- PLOT_1_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
161
-
162
- PLOT_2_TITLE = "Comparison Count of Each Combination of Models (not allowing ties)"
163
-
164
- PLOT_3_TITLE = "Elo Estimates with error bars (ties allowed)"
165
-
166
- PLOT_4_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
 
57
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
58
 
59
  INTRODUCTION_TEXT = f"""
60
+ 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
61
 
62
+ 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
 
 
63
  """
64
 
65
  LLM_BENCHMARKS_TEXT = f"""
66
+ With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
67
+
68
+ 📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
69
+
70
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
71
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
72
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
 
75
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
76
  """
77
 
78
  EVALUATION_QUEUE_TEXT = f"""
79
+ # Evaluation Queue for the 🤗 Open LLM Leaderboard
80
+ These models will be automatically evaluated on the 🤗 cluster.
81
  """
82
 
83
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
146
  eprint={2109.07958},
147
  archivePrefix={arXiv},
148
  primaryClass={cs.CL}
149
+ }"""
src/elo_leaderboard/load_results.py DELETED
@@ -1,200 +0,0 @@
1
- from collections import defaultdict
2
- from dataclasses import dataclass
3
- from typing import Dict, List
4
-
5
- import numpy as np
6
- import pandas as pd
7
- from datasets import load_dataset
8
-
9
- from src.assets.text_content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
10
- from src.utils_display import make_clickable_model, EloEvalColumn
11
- from .visualizations import (
12
- get_bootstrap_result,
13
- switch_model_a_b,
14
- visualize_battle_count,
15
- visualize_bootstrap_scores,
16
- visualize_pairwise_win_fraction,
17
- visualize_rating_count,
18
- )
19
-
20
-
21
- @dataclass
22
- class EloEvalResult:
23
- model: str
24
- gpt_4_all: int
25
- human_all: int
26
- human_instruct: int
27
- human_code_instruct: int
28
- tie_allowed: bool
29
-
30
- def to_dict(self):
31
- base_model = f"{self.model}"
32
- data_dict = {}
33
- data_dict[EloEvalColumn.model.name] = make_clickable_model(base_model)
34
- data_dict[EloEvalColumn.gpt4.name] = self.gpt_4_all
35
- data_dict[EloEvalColumn.human_all.name] = self.human_all
36
- data_dict[EloEvalColumn.human_instruct.name] = self.human_instruct
37
- data_dict[EloEvalColumn.human_code_instruct.name] = self.human_code_instruct
38
-
39
- return data_dict
40
-
41
-
42
- def create_eval_df(df, tie_allowed):
43
- responses = []
44
- for _, row in df.iterrows():
45
- if row["status"] == "canceled":
46
- continue
47
-
48
- rating = row["response"]["annotations"]["Preference"]
49
- if rating == "NaN":
50
- continue
51
-
52
- scores = row["response"]["responses"]
53
- if any(s["Preference"] == "" for s in scores):
54
- continue
55
-
56
- response = {
57
- "id": row["task_id"],
58
- "prompt": row["params"]["templateVariables"]["prompt"],
59
- "model_a": row["params"]["templateVariables"]["modela"],
60
- "model_b": row["params"]["templateVariables"]["modelb"],
61
- "response_a": row["params"]["templateVariables"]["response1"],
62
- "response_b": row["params"]["templateVariables"]["response2"],
63
- "rating": int(rating),
64
- "ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)],
65
- }
66
-
67
- if tie_allowed:
68
- response["win"] = (
69
- "model_a"
70
- if response["rating"] < 4
71
- else "model_b"
72
- if response["rating"] > 5
73
- else "tie"
74
- )
75
- else:
76
- response["win"] = "model_a" if response["rating"] < 5 else "model_b"
77
-
78
- responses.append(response)
79
-
80
- return pd.DataFrame(responses)
81
-
82
-
83
- def create_eval_df_for_gpt(df, tie_allowed):
84
- responses = []
85
- for _, row in df.iterrows():
86
- response = {
87
- "id": row["review_id"],
88
- "prompt": row["question"],
89
- "model_a": row["model1"],
90
- "model_b": row["model2"],
91
- "response_a": row["answer1"],
92
- "response_b": row["answer2"],
93
- "rating": row["score"][0],
94
- }
95
-
96
- if tie_allowed:
97
- response["win"] = (
98
- "model_a"
99
- if response["rating"] < 4
100
- else "model_b"
101
- if response["rating"] > 5
102
- else "tie"
103
- )
104
- else:
105
- response["win"] = "model_a" if response["rating"] < 5 else "model_b"
106
-
107
- responses.append(response)
108
-
109
- return pd.DataFrame(responses)
110
-
111
-
112
- # Compute the Elo rating for each model
113
- def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000):
114
- rating = defaultdict(lambda: initial_rating)
115
-
116
- for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples():
117
- ra = rating[model_a]
118
- rb = rating[model_b]
119
- ea = 1 / (1 + base ** ((rb - ra) / scale))
120
- eb = 1 / (1 + base ** ((ra - rb) / scale))
121
- if win == "model_a":
122
- sa = 1
123
- elif win == "model_b":
124
- sa = 0
125
- elif win == "tie" or win == "tie (bothbad)":
126
- sa = 0.5
127
- else:
128
- raise Exception(f"unexpected vote {win}")
129
- rating[model_a] += k * (sa - ea)
130
- rating[model_b] += k * (1 - sa - eb)
131
-
132
- return rating
133
-
134
-
135
- def convert_rating_from_float_to_int(df):
136
- return {model: int(rating) for model, rating in compute_elo(df).items()}
137
-
138
-
139
- def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
140
- df_all = pd.concat([df_instruct, df_code_instruct])
141
-
142
- df_gpt_4 = load_dataset(
143
- "gpt_4_evals/data/",
144
- split="train",
145
- revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
146
- ).to_pandas()
147
-
148
- dfs = [df_instruct, df_code_instruct, df_all]
149
- elo_ratings = [
150
- convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
151
- for df in dfs
152
- ]
153
-
154
- gpt_4_elo_ratings = convert_rating_from_float_to_int(
155
- create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
156
- )
157
- elo_ratings.append(gpt_4_elo_ratings)
158
-
159
- results = [
160
- EloEvalResult(
161
- model=model_name,
162
- gpt_4_all=elo_ratings[3][model_name],
163
- human_all=elo_ratings[2][model_name],
164
- human_instruct=elo_ratings[0][model_name],
165
- human_code_instruct=elo_ratings[1][model_name],
166
- tie_allowed=tie_allowed,
167
- )
168
- for model_name in elo_ratings[0].keys()
169
- ]
170
-
171
- return results
172
-
173
-
174
- def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]:
175
- eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed)
176
- return [r.to_dict() for r in eval_results]
177
-
178
-
179
- def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
180
- df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed)
181
- df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed)
182
- df_all = pd.concat([df_instruct, df_code_instruct])
183
- game = df_all[["model_a", "model_b", "win"]]
184
-
185
- game_switch = switch_model_a_b(game)
186
- plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE)
187
-
188
- plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE)
189
-
190
- BOOTSTRAP_ROUNDS = 1000
191
- if "bootstrap_elo_lu" not in globals():
192
- bootstrap_elo_lu = get_bootstrap_result(
193
- game_switch, compute_elo, BOOTSTRAP_ROUNDS
194
- )
195
-
196
- plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
197
-
198
- plot_4 = visualize_rating_count(game, PLOT_4_TITLE)
199
-
200
- return plot_1, plot_2, plot_3, plot_4
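For reference, the deleted compute_elo above applies the standard Elo update: the expected score is E_A = 1 / (1 + base ** ((R_B - R_A) / scale)), and each rating moves by k * (S_A - E_A), where S_A is 1 for a win, 0 for a loss and 0.5 for a tie. A minimal standalone sketch of that update (the elo_update helper and the example model names are illustrative, not part of the repository):

from collections import defaultdict

def elo_update(rating, model_a, model_b, win, k=32, scale=400, base=10):
    # Expected score of each side under the current ratings.
    ra, rb = rating[model_a], rating[model_b]
    ea = 1 / (1 + base ** ((rb - ra) / scale))
    eb = 1 / (1 + base ** ((ra - rb) / scale))
    # Actual score of model A: 1 for a win, 0 for a loss, 0.5 for a tie.
    sa = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[win]
    rating[model_a] = ra + k * (sa - ea)
    rating[model_b] = rb + k * ((1 - sa) - eb)

ratings = defaultdict(lambda: 1000)  # every model starts at 1000, as in the deleted code
elo_update(ratings, "model-x", "model-y", "model_a")  # hypothetical battle won by model-x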
src/elo_leaderboard/visualizations.py DELETED
@@ -1,137 +0,0 @@
1
- import math
2
-
3
- import numpy as np
4
- import pandas as pd
5
- import plotly.express as px
6
-
7
-
8
- # 1
9
- def compute_pairwise_win_fraction(battles):
10
- # Times each model wins as Model A
11
- a_win_ptbl = pd.pivot_table(
12
- battles[battles["win"] == "model_a"],
13
- index="model_a",
14
- columns="model_b",
15
- aggfunc="size",
16
- fill_value=0,
17
- )
18
-
19
- # Table counting times each model wins as Model B
20
- b_win_ptbl = pd.pivot_table(
21
- battles[battles["win"] == "model_b"],
22
- index="model_a",
23
- columns="model_b",
24
- aggfunc="size",
25
- fill_value=0,
26
- )
27
-
28
- # Table counting number of A-B pairs
29
- num_battles_ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
30
-
31
- # Computing the proportion of wins for each model as A and as B
32
- # against all other models
33
- row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (num_battles_ptbl + num_battles_ptbl.T)
34
-
35
- # Arrange ordering according to proprition of wins
36
- prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
37
- model_names = list(prop_wins.keys())
38
- row_beats_col = row_beats_col_freq.loc[model_names, model_names]
39
- return row_beats_col
40
-
41
-
42
- def visualize_pairwise_win_fraction(battles, title):
43
- row_beats_col = compute_pairwise_win_fraction(battles)
44
- fig = px.imshow(row_beats_col, color_continuous_scale="RdBu", text_auto=".2f", title=title)
45
- fig.update_layout(
46
- xaxis_title="Model B",
47
- yaxis_title="Model A",
48
- xaxis_side="top",
49
- title_y=0.07,
50
- title_x=0.5,
51
- )
52
- fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>")
53
- return fig
54
-
55
-
56
- # 2
57
- def switch_model_a_b(df):
58
- df_switch = df.copy()
59
- # switch with probability 0.5
60
- for i, row in df.iterrows():
61
- if np.random.rand() < 0.5:
62
- df_switch.at[i, "model_a"] = row["model_b"]
63
- df_switch.at[i, "model_b"] = row["model_a"]
64
- if row["win"] == "model_a":
65
- df_switch.at[i, "win"] = "model_b"
66
- elif row["win"] == "model_b":
67
- df_switch.at[i, "win"] = "model_a"
68
- return df_switch
69
-
70
-
71
- def visualize_battle_count(battles, title):
72
- ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
73
- battle_counts = ptbl + ptbl.T
74
- ordering = battle_counts.sum().sort_values(ascending=False).index
75
- fig = px.imshow(battle_counts.loc[ordering, ordering], title=title, text_auto=True, width=600)
76
- fig.update_layout(
77
- xaxis_title="Model B",
78
- yaxis_title="Model A",
79
- xaxis_side="top",
80
- title_y=0.07,
81
- title_x=0.5,
82
- )
83
- fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
84
- return fig
85
-
86
-
87
- # 3
88
- def get_bootstrap_result(battles, func_compute_elo, num_round):
89
- rows = [func_compute_elo(battles.sample(frac=1.0, replace=True)) for _ in range(num_round)]
90
- df = pd.DataFrame(rows)
91
- return df[df.median().sort_values(ascending=False).index]
92
-
93
-
94
- def visualize_bootstrap_scores(df, title):
95
- bars = (
96
- pd.DataFrame(
97
- dict(
98
- lower=df.quantile(0.025),
99
- rating=df.quantile(0.5),
100
- upper=df.quantile(0.975),
101
- )
102
- )
103
- .reset_index(names="model")
104
- .sort_values("rating", ascending=False)
105
- )
106
- bars["error_y"] = bars["upper"] - bars["rating"]
107
- bars["error_y_minus"] = bars["rating"] - bars["lower"]
108
- bars["rating_rounded"] = np.round(bars["rating"], 2)
109
- fig = px.scatter(
110
- bars,
111
- x="model",
112
- y="rating",
113
- error_y="error_y",
114
- error_y_minus="error_y_minus",
115
- text="rating_rounded",
116
- title=title,
117
- )
118
- fig.update_layout(xaxis_title="Model", yaxis_title="Rating")
119
- return fig
120
-
121
-
122
- # 4
123
- def visualize_rating_count(df, title):
124
- df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
125
- fig = px.bar(df_all_value_counts, title=title, text_auto=True)
126
-
127
- min_y = df_all_value_counts.min()
128
- max_y = df_all_value_counts.max()
129
-
130
- y_end = math.ceil(min_y / 100) * 100
131
- y_begin = math.floor(max_y / 100) * 100
132
-
133
- fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
134
- fig.update_yaxes(range=[y_begin, y_end])
135
- # save the plot for the blog:
136
- fig.write_html("src/assets/model_counts.html", full_html=False, include_plotlyjs="cdn")
137
- return fig
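For reference, the deleted get_bootstrap_result and visualize_bootstrap_scores above estimate rating uncertainty by resampling the battle log with replacement, recomputing Elo on each resample, and reading off the 2.5% / 50% / 97.5% quantiles per model. A condensed sketch of that loop (bootstrap_elo_quantiles is an illustrative name; compute_elo_fn stands in for the compute_elo that was defined in the deleted load_results.py):

import pandas as pd

def bootstrap_elo_quantiles(battles: pd.DataFrame, compute_elo_fn, num_round: int = 1000) -> pd.DataFrame:
    # One Elo fit per bootstrap resample of the battle log.
    rows = [compute_elo_fn(battles.sample(frac=1.0, replace=True)) for _ in range(num_round)]
    ratings = pd.DataFrame(rows)
    # Lower bound, median and upper bound per model; these back the error bars in the Elo plot.
    return ratings.quantile([0.025, 0.5, 0.975])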
src/init.py CHANGED
@@ -15,15 +15,11 @@ def get_all_requested_models(requested_models_dir):
15
 
16
  return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
17
 
18
- def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
19
  auto_eval_repo = None
20
  requested_models = None
21
  if H4_TOKEN:
22
  print("Pulling evaluation requests and results.")
23
- # try:
24
- # shutil.rmtree("./auto_evals/")
25
- # except:
26
- # pass
27
 
28
  auto_eval_repo = Repository(
29
  local_dir="./auto_evals/",
@@ -36,29 +32,7 @@ def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
36
  requested_models_dir = "./auto_evals/eval_requests"
37
  requested_models = get_all_requested_models(requested_models_dir)
38
 
39
- human_eval_repo = None
40
- if H4_TOKEN and not os.path.isdir("./human_evals"):
41
- print("Pulling human evaluation repo")
42
- human_eval_repo = Repository(
43
- local_dir="./human_evals/",
44
- clone_from=HUMAN_EVAL_REPO,
45
- use_auth_token=H4_TOKEN,
46
- repo_type="dataset",
47
- )
48
- human_eval_repo.git_pull()
49
-
50
- gpt_4_eval_repo = None
51
- if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
52
- print("Pulling GPT-4 evaluation repo")
53
- gpt_4_eval_repo = Repository(
54
- local_dir="./gpt_4_evals/",
55
- clone_from=GPT_4_EVAL_REPO,
56
- use_auth_token=H4_TOKEN,
57
- repo_type="dataset",
58
- )
59
- gpt_4_eval_repo.git_pull()
60
-
61
- return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models
62
 
63
 
64
  #def load_results(model, benchmark, metric):
 
15
 
16
  return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
17
 
18
+ def load_all_info_from_hub(LMEH_REPO):
19
  auto_eval_repo = None
20
  requested_models = None
21
  if H4_TOKEN:
22
  print("Pulling evaluation requests and results.")
23
 
24
  auto_eval_repo = Repository(
25
  local_dir="./auto_evals/",
 
32
  requested_models_dir = "./auto_evals/eval_requests"
33
  requested_models = get_all_requested_models(requested_models_dir)
34
 
35
+ return auto_eval_repo, requested_models
36
 
37
 
38
  #def load_results(model, benchmark, metric):