margsli committed on
Commit
95307c3
β€’
1 Parent(s): 75dff90

Update app.py

Files changed (1)
  1. app.py +25 -109
app.py CHANGED
@@ -9,23 +9,17 @@ import numpy as np
 import pandas as pd
 
 
- # notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
- notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
-
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
-
 def make_default_md(arena_df, elo_results):
 total_votes = sum(arena_df["num_battles"]) // 2
 total_models = len(arena_df)
 
 leaderboard_md = f"""
- # 🏆 LMSYS Chatbot Arena Leaderboard
- | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
- LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
- We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
 return leaderboard_md
 
@@ -35,11 +29,8 @@ def make_arena_leaderboard_md(arena_df):
 total_models = len(arena_df)
 space = "&nbsp;&nbsp;&nbsp;"
 leaderboard_md = f"""
- Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April 11, 2024.
 
- 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
-
- Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). Cast your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
 return leaderboard_md
 
@@ -56,14 +47,9 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"
 
 def make_full_leaderboard_md(elo_results):
 leaderboard_md = f"""
- Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
- - [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute Elo ratings.
- - [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
- - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
-
- 💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
- The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
- Higher values are better for all benchmarks. Empty cells mean not available.
 """
 return leaderboard_md
 
@@ -334,13 +320,11 @@ cat_name_to_explanation = {
 "Exclude Short": "User Query >= 5 tokens",
 }
 
-
 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
 arena_dfs = {}
 category_elo_results = {}
 if elo_results_file is None: # Do live update
 default_md = "Loading ..."
- p1 = p2 = p3 = p4 = None
 else:
 with open(elo_results_file, "rb") as fin:
 elo_results = pickle.load(fin)
@@ -352,10 +336,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
 category_elo_results[key_to_category_name[k]] = elo_results[k]
 
- p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
- p2 = category_elo_results["Overall"]["battle_count_heatmap"]
- p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
- p4 = category_elo_results["Overall"]["average_win_rate_bar"]
 arena_df = arena_dfs["Overall"]
 default_md = make_default_md(arena_df, category_elo_results["Overall"])
 
@@ -382,8 +362,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 "Rank",
 "🤖 Model",
 "⭐ Arena Elo",
- "📊 95% CI",
- "🗳️ Votes",
 "Organization",
 "License",
 "Knowledge Cutoff",
@@ -393,55 +371,23 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 "markdown",
 "number",
 "str",
- "number",
- "str",
 "str",
 "str",
 ],
 value=arena_table_vals,
 elem_id="arena_leaderboard_dataframe",
 height=700,
- column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
 wrap=True,
 )
 
 gr.Markdown(
- f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
- A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
- See Figure 3 below for visualization of the confidence intervals. More details in [notebook]({notebook_url}).
 """,
 elem_id="leaderboard_markdown"
 )
 
- leader_component_values[:] = [default_md, p1, p2, p3, p4]
-
- if show_plot:
- more_stats_md = gr.Markdown(
- f"""## More Statistics for Chatbot Arena (Overall)""",
- elem_id="leaderboard_header_markdown"
- )
- with gr.Row():
- with gr.Column():
- gr.Markdown(
- "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title"
- )
- plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
- with gr.Column():
- gr.Markdown(
- "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
- )
- plot_2 = gr.Plot(p2, show_label=False)
- with gr.Row():
- with gr.Column():
- gr.Markdown(
- "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
- )
- plot_3 = gr.Plot(p3, show_label=False)
- with gr.Column():
- gr.Markdown(
- "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
- )
- plot_4 = gr.Plot(p4, show_label=False)
 
 with gr.Tab("Full Leaderboard", id=1):
 md = make_full_leaderboard_md(elo_results)
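The note removed in the hunk above spells out how the Arena table turns confidence intervals into ranks: a model is placed above another only when its CI lower bound exceeds the other model's upper bound. A minimal standalone sketch of that rule, using hypothetical column names and values rather than anything from this repo:

```python
import pandas as pd

# Hypothetical CI bounds, purely for illustration.
models = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "ci_lower": [1062, 1055, 990],
    "ci_upper": [1080, 1076, 1012],
})

def rank_by_ci(df):
    # A model's rank is 1 + the number of models whose CI lower bound
    # exceeds its CI upper bound (i.e., models that are statistically better).
    ranks = [1 + int((df["ci_lower"] > row.ci_upper).sum()) for row in df.itertuples()]
    return df.assign(rank=ranks).sort_values("rank")

print(rank_by_ci(models))  # model-a and model-b share rank 1; model-c is rank 3
```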
@@ -450,9 +396,9 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 gr.Dataframe(
 headers=[
 "🤖 Model",
- "⭐ Arena Elo",
- "📈 MT-bench",
- "📚 MMLU",
 "Organization",
 "License",
 ],
@@ -465,8 +411,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 )
 if not show_plot:
 gr.Markdown(
- """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
- If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
 """,
 elem_id="leaderboard_markdown",
 )
@@ -474,7 +419,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 pass
 
 def update_leaderboard_df(arena_table_vals):
- elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "Delta", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
 
 # goal: color the rows based on the rank with styler
 def highlight_max(s):
@@ -484,7 +429,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 def highlight_rank_max(s):
 return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
 
- return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Delta"])
 
 def update_leaderboard_and_plots(category):
 arena_subset_df = arena_dfs[category]
@@ -497,30 +442,24 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 arena_values = gr.Dataframe(
 headers=[
 "Rank",
- "Delta",
 "🤖 Model",
 "⭐ Arena Elo",
- "📊 95% CI",
- "🗳️ Votes",
 "Organization",
 "License",
 "Knowledge Cutoff",
 ],
 datatype=[
- "number",
 "number",
 "markdown",
 "number",
 "str",
- "number",
- "str",
 "str",
 "str",
 ],
 value=arena_values,
 elem_id="arena_leaderboard_dataframe",
 height=700,
- column_widths=[60, 70, 190, 110, 100, 90, 160, 150, 140],
 wrap=True,
 )
 else:
@@ -529,8 +468,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 "Rank",
 "🤖 Model",
 "⭐ Arena Elo",
- "📊 95% CI",
- "🗳️ Votes",
 "Organization",
 "License",
 "Knowledge Cutoff",
@@ -540,28 +477,21 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 "markdown",
 "number",
 "str",
- "number",
- "str",
 "str",
 "str",
 ],
 value=arena_values,
 elem_id="arena_leaderboard_dataframe",
 height=700,
- column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
 wrap=True,
 )
 
- p1 = elo_subset_results["win_fraction_heatmap"]
- p2 = elo_subset_results["battle_count_heatmap"]
- p3 = elo_subset_results["bootstrap_elo_rating"]
- p4 = elo_subset_results["average_win_rate_bar"]
- more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
- """
 leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
- return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
 
- category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
 
 with gr.Accordion(
 "📝 Citation",
@@ -569,22 +499,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 ):
 citation_md = """
 ### Citation
- Please cite the following paper if you find our leaderboard or dataset helpful.
- ```
- @misc{chiang2024chatbot,
- title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
- author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
- year={2024},
- eprint={2403.04132},
- archivePrefix={arXiv},
- primaryClass={cs.AI}
- }
 """
 gr.Markdown(citation_md, elem_id="leaderboard_markdown")
 gr.Markdown(acknowledgment_md)
 
 if show_plot:
- return [md_1, plot_1, plot_2, plot_3, plot_4]
 return [md_1]
 
 
@@ -656,15 +578,9 @@ footer {
 
 acknowledgment_md = """
 ### Acknowledgment
- We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/), [Together AI](https://www.together.ai/), [Anyscale](https://www.anyscale.com/), [HuggingFace](https://huggingface.co/) for their generous [sponsorship](https://lmsys.org/donations/).
 
 <div class="sponsor-image-about">
- <img src="https://storage.googleapis.com/public-arena-asset/kaggle.png" alt="Kaggle">
- <img src="https://storage.googleapis.com/public-arena-asset/mbzuai.jpeg" alt="MBZUAI">
- <img src="https://storage.googleapis.com/public-arena-asset/a16z.jpeg" alt="a16z">
- <img src="https://storage.googleapis.com/public-arena-asset/together.png" alt="Together AI">
- <img src="https://storage.googleapis.com/public-arena-asset/anyscale.png" alt="AnyScale">
- <img src="https://storage.googleapis.com/public-arena-asset/huggingface.png" alt="HuggingFace">
 </div>
 """
 
@@ -674,7 +590,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
 theme.set(button_secondary_background_fill_hover="*primary_300",
 button_secondary_background_fill_hover_dark="*primary_700")
 with gr.Blocks(
- title="Chatbot Arena Leaderboard",
 theme=theme,
 # theme = gr.themes.Base.load("theme.json"), # uncomment to use new cool theme
 css=block_css,
 
 import pandas as pd
 
 
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
 def make_default_md(arena_df, elo_results):
 total_votes = sum(arena_df["num_battles"]) // 2
 total_models = len(arena_df)
 
 leaderboard_md = f"""
+ # NeurIPS LLM Merging Competition Leaderboard
+ [Website]() | [GitHub]() | [Discord]() |
 
 """
 return leaderboard_md
 
 total_models = len(arena_df)
 space = "&nbsp;&nbsp;&nbsp;"
 leaderboard_md = f"""
+ Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: June 1, 2024.
 
 """
 return leaderboard_md
 
 
 def make_full_leaderboard_md(elo_results):
 leaderboard_md = f"""
+ Three benchmarks are displayed: **Task 1**, **Task 2**, **Task 3**.
+
+ Higher values are better for all benchmarks.
 
 """
 return leaderboard_md
 
 "Exclude Short": "User Query >= 5 tokens",
 }
 
 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
 arena_dfs = {}
 category_elo_results = {}
 if elo_results_file is None: # Do live update
 default_md = "Loading ..."
 else:
 with open(elo_results_file, "rb") as fin:
 elo_results = pickle.load(fin)
 
 arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
 category_elo_results[key_to_category_name[k]] = elo_results[k]
 
 arena_df = arena_dfs["Overall"]
 default_md = make_default_md(arena_df, category_elo_results["Overall"])
 
 "Rank",
 "🤖 Model",
 "⭐ Arena Elo",
 "Organization",
 "License",
 "Knowledge Cutoff",
 
 "markdown",
 "number",
 "str",
 "str",
 "str",
 ],
 value=arena_table_vals,
 elem_id="arena_leaderboard_dataframe",
 height=700,
+ column_widths=[70, 190, 110, 160, 150, 140],
 wrap=True,
 )
 
 gr.Markdown(
+ f"""Note: .
 """,
 elem_id="leaderboard_markdown"
 )
 
+ leader_component_values[:] = [default_md]
 
 with gr.Tab("Full Leaderboard", id=1):
 md = make_full_leaderboard_md(elo_results)
 
 gr.Dataframe(
 headers=[
 "🤖 Model",
+ "⭐ Task 1",
+ "📈 Task 2",
+ "📚 Task 3",
 "Organization",
 "License",
 ],
 
 )
 if not show_plot:
 gr.Markdown(
+ """ ## Submit your model [here]().
 """,
 elem_id="leaderboard_markdown",
 )
 
 pass
 
 def update_leaderboard_df(arena_table_vals):
+ elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "🤖 Model", "⭐ Arena Elo", "Organization", "License", "Knowledge Cutoff"])
 
 # goal: color the rows based on the rank with styler
 def highlight_max(s):
 
 def highlight_rank_max(s):
 return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
 
+ return elo_datarame.style.apply(highlight_max, subset=["Rank"])
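The `update_leaderboard_df` helper above returns a pandas Styler built with `DataFrame.style.apply`, which calls the styling function once per column listed in `subset` and expects one CSS string per cell. A self-contained sketch of that pattern with a hypothetical highlighting function (the body of the app's own `highlight_max` is not shown in this diff):

```python
import pandas as pd

# Hypothetical rows; in the app these come from arena_table_vals.
df = pd.DataFrame({"Rank": [1, 2, 3], "Model": ["model-a", "model-b", "model-c"]})

def highlight_best_rank(col):
    # Return one CSS string per cell: bold the best (lowest) rank, leave the rest unstyled.
    return ["font-weight: bold" if v == col.min() else "" for v in col]

styler = df.style.apply(highlight_best_rank, subset=["Rank"])
print(styler.to_html())  # a Styler renders to HTML rather than to a plain DataFrame
```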
 
 def update_leaderboard_and_plots(category):
 arena_subset_df = arena_dfs[category]
 
 arena_values = gr.Dataframe(
 headers=[
 "Rank",
 "🤖 Model",
 "⭐ Arena Elo",
 "Organization",
 "License",
 "Knowledge Cutoff",
 ],
 datatype=[
 "number",
 "markdown",
 "number",
 "str",
 "str",
 "str",
 ],
 value=arena_values,
 elem_id="arena_leaderboard_dataframe",
 height=700,
+ column_widths=[60, 190, 110, 160, 150, 140],
 wrap=True,
 )
 else:
 
 "Rank",
 "🤖 Model",
 "⭐ Arena Elo",
 "Organization",
 "License",
 "Knowledge Cutoff",
 
 "markdown",
 "number",
 "str",
 "str",
 "str",
 ],
 value=arena_values,
 elem_id="arena_leaderboard_dataframe",
 height=700,
+ column_widths=[70, 190, 110, 160, 150, 140],
 wrap=True,
 )
 
+
 leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
+ return arena_values, more_stats_md, leaderboard_md
 
+ category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, more_stats_md, category_deets])
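The rewired `category_dropdown.change(...)` call above maps the callback's three return values onto three output components; in Gradio Blocks the return values must match the `outputs` list in count and order. A minimal, self-contained sketch of that wiring with hypothetical component names (not this app's code):

```python
import gradio as gr

def on_category_change(category):
    # One return value per entry in outputs=[...], in the same order.
    rows = [[1, f"model-for-{category}", 1000.0]]        # hypothetical table rows
    stats_md = f"## More Statistics - {category}"
    details_md = f"Showing category: {category}"
    return rows, stats_md, details_md

with gr.Blocks() as demo:
    category = gr.Dropdown(["Overall", "Coding"], value="Overall", label="Category")
    table = gr.Dataframe(headers=["Rank", "Model", "Score"])
    stats = gr.Markdown()
    details = gr.Markdown()
    category.change(on_category_change, inputs=[category], outputs=[table, stats, details])

if __name__ == "__main__":
    demo.launch()
```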
 
 with gr.Accordion(
 "📝 Citation",
 
 ):
 citation_md = """
 ### Citation
+ Please cite the following paper
+
 """
 gr.Markdown(citation_md, elem_id="leaderboard_markdown")
 gr.Markdown(acknowledgment_md)
 
 if show_plot:
+ return [md_1]
 return [md_1]
 
 
 acknowledgment_md = """
 ### Acknowledgment
+ We thank []() for their generous [sponsorship]().
 
 <div class="sponsor-image-about">
 </div>
 """
 
 theme.set(button_secondary_background_fill_hover="*primary_300",
 button_secondary_background_fill_hover_dark="*primary_700")
 with gr.Blocks(
+ title="LLM Merging Leaderboard",
 theme=theme,
 # theme = gr.themes.Base.load("theme.json"), # uncomment to use new cool theme
 css=block_css,