Lisa Dunlap committed
Commit e022a14 • 1 Parent(s): fc39491

updated with full category results

Files changed (2)
  1. app.py +91 -94
  2. elo_results_20240403.pkl +3 -0
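The new elo_results_20240403.pkl carries the per-category results named in the commit title. Its exact schema is not shown on this page, so the sketch below assumes only what the updated app.py reads from it: a dict keyed by category ("full", "coding", "long", "english", "chinese"), each entry holding the four plots plus a leaderboard_table_df.

import pickle

# Assumed layout of elo_results_20240403.pkl, inferred from the keys app.py
# accesses in this commit; the real pickle may contain additional fields.
with open("elo_results_20240403.pkl", "rb") as fin:
    elo_results = pickle.load(fin)

for key, results in elo_results.items():  # "full", "coding", "long", "english", "chinese"
    leaderboard_df = results["leaderboard_table_df"]  # per-category Elo table
    win_rate_bar = results["average_win_rate_bar"]    # plot shown as Figure 4
    print(key, "->", len(leaderboard_df), "models")

In the app, key_to_category_name remaps these keys to the display names offered by the new category dropdown.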
app.py CHANGED
@@ -12,7 +12,6 @@ import pandas as pd
 # notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
 notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
 
-
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
@@ -31,20 +30,25 @@ We've collected over **500,000** human preference votes to rank LLMs with the Elo
     return leaderboard_md
 
 
-def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
+def make_arena_leaderboard_md(arena_df):
     total_votes = sum(arena_df["num_battles"]) // 2
     total_models = len(arena_df)
     space = "   "
-    if arena_subset_df is not None:
-        total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
-        total_subset_models = len(arena_subset_df)
-        vote_str = f"{space} {name} #models: **{total_subset_models}**.{space} {name} #votes: **{'{:,}'.format(total_subset_votes)}**."
-    else:
-        vote_str = ""
     leaderboard_md = f"""
-Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{vote_str}{space} Last updated: March 29, 2024.
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: March 29, 2024.
+
+**NEW!** View ELO leaderboard and stats for different input categories.
+"""
+    return leaderboard_md
 
-**NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
+
+def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = "   "
+    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
+    total_subset_models = len(arena_subset_df)
+    leaderboard_md = f"""### {name} Question Coverage
+#models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)**.{space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**.{space}
 """
     return leaderboard_md
 
@@ -279,19 +283,11 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
             print(f"{model_key} - {e}")
     return values
 
-def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_df, elo_subset_results):
-    arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
-    p1 = elo_subset_results["win_fraction_heatmap"]
-    p2 = elo_subset_results["battle_count_heatmap"]
-    p3 = elo_subset_results["bootstrap_elo_rating"]
-    p4 = elo_subset_results["average_win_rate_bar"]
-    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})
-    """
-    leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
-    return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
-
+key_to_category_name = {"full": "Total", "coding": "Coding", "long": "Long Conversation", "english": "English", "chinese": "Chinese"}
 
 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
+    arena_dfs = {}
+    category_elo_results = {}
     if elo_results_file is None:  # Do live update
         default_md = "Loading ..."
         p1 = p2 = p3 = p4 = None
@@ -299,25 +295,20 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
         with open(elo_results_file, "rb") as fin:
             elo_results = pickle.load(fin)
         if "full" in elo_results:
-            elo_chinese_results = elo_results["chinese"]
-            elo_long_results = elo_results["long"]
-            elo_english_results = elo_results["english"]
-            elo_coding_results = elo_results["coding"]
-            elo_results = elo_results["full"]
-
-        p1 = elo_results["win_fraction_heatmap"]
-        p2 = elo_results["battle_count_heatmap"]
-        p3 = elo_results["bootstrap_elo_rating"]
-        p4 = elo_results["average_win_rate_bar"]
-        arena_df = elo_results["leaderboard_table_df"]
-        arena_chinese_df = elo_chinese_results["leaderboard_table_df"]
-        arena_long_df = elo_long_results["leaderboard_table_df"]
-        arena_english_df = elo_english_results["leaderboard_table_df"]
-        arena_coding_df = elo_coding_results["leaderboard_table_df"]
-        default_md = make_default_md(arena_df, elo_results)
+            print("KEYS ", elo_results.keys())
+            for k in elo_results.keys():
+                for k in key_to_category_name:
+                    arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
+                    category_elo_results[key_to_category_name[k]] = elo_results[k]
+
+        p1 = category_elo_results["Total"]["win_fraction_heatmap"]
+        p2 = category_elo_results["Total"]["battle_count_heatmap"]
+        p3 = category_elo_results["Total"]["bootstrap_elo_rating"]
+        p4 = category_elo_results["Total"]["average_win_rate_bar"]
+        arena_df = arena_dfs["Total"]
+        default_md = make_default_md(arena_df, category_elo_results["Total"])
 
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
-    # md = make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df)
     if leaderboard_table_file:
         data = load_leaderboard_table_csv(leaderboard_table_file)
         model_table_df = pd.DataFrame(data)
@@ -329,20 +320,11 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                 md = make_arena_leaderboard_md(arena_df)
                 leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
                 with gr.Row():
-                    overall_rating = gr.Button("Overall")
-                    # update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
-                    update_overall_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, None, elo_results)
-                    coding_rating = gr.Button("Coding")
-                    update_coding_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_coding_df, elo_coding_results)
-                    long_context_rating = gr.Button("Long Conversation")
-                    update_long_context_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_long_df, elo_long_results)
-                    # update_long_context_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_long_df)
-                    english_rating = gr.Button("English")
-                    update_english_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_english_df, elo_english_results)
-                    # update_english_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_english_df)
-                    chinese_rating = gr.Button("Chinese")
-                    update_chinese_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_chinese_df, elo_chinese_results)
-                    # update_chinese_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_chinese_df)
+                    category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Total")
+                default_category_details = make_category_arena_leaderboard_md(arena_df, arena_df, name="Total")
+                with gr.Column(variant="panel"):
+                    category_deets = gr.Markdown(default_category_details, elem_id="category_deets")
+
                 elo_display_df = gr.Dataframe(
                     headers=[
                         "Rank",
@@ -371,6 +353,44 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                     wrap=True,
                 )
 
+                gr.Markdown(
+                    f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
+                    A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
+                    See Figure 3 below for visualization of the confidence intervals. Code to recreate these tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+                    """,
+                    elem_id="leaderboard_markdown"
+                )
+
+                leader_component_values[:] = [default_md, p1, p2, p3, p4]
+
+                if show_plot:
+                    more_stats_md = gr.Markdown(
+                        f"""## More Statistics for Chatbot Arena (Overall)""",
+                        elem_id="leaderboard_header_markdown"
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            gr.Markdown(
+                                "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title"
+                            )
+                            plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
+                        with gr.Column():
+                            gr.Markdown(
+                                "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
+                            )
+                            plot_2 = gr.Plot(p2, show_label=False)
+                    with gr.Row():
+                        with gr.Column():
+                            gr.Markdown(
+                                "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
+                            )
+                            plot_3 = gr.Plot(p3, show_label=False)
+                        with gr.Column():
+                            gr.Markdown(
+                                "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
+                            )
+                            plot_4 = gr.Plot(p4, show_label=False)
+
             with gr.Tab("Full Leaderboard", id=1):
                 md = make_full_leaderboard_md(elo_results)
                 gr.Markdown(md, elem_id="leaderboard_markdown")
@@ -401,49 +421,21 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
     else:
         pass
 
-    gr.Markdown(
-        f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
-        A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-        See Figure 3 below for visualization of the confidence intervals. Code to recreate these tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
-        """,
-        elem_id="leaderboard_markdown"
-    )
-
-    leader_component_values[:] = [default_md, p1, p2, p3, p4]
-
-    if show_plot:
-        more_stats_md = gr.Markdown(
-            f"""## More Statistics for Chatbot Arena (Overall)""",
-            elem_id="leaderboard_header_markdown"
-        )
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title"
-                )
-                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
-                )
-                plot_2 = gr.Plot(p2, show_label=False)
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
-                )
-                plot_3 = gr.Plot(p3, show_label=False)
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
-                )
-                plot_4 = gr.Plot(p4, show_label=False)
+    def update_leaderboard_and_plots(category):
+        arena_subset_df = arena_dfs[category]
+        elo_subset_results = category_elo_results[category]
+        arena_df = arena_dfs["Total"]
+        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)
+        p1 = elo_subset_results["win_fraction_heatmap"]
+        p2 = elo_subset_results["battle_count_heatmap"]
+        p3 = elo_subset_results["bootstrap_elo_rating"]
+        p4 = elo_subset_results["average_win_rate_bar"]
+        more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
+        """
+        leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
+        return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
 
-    overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
-    coding_rating.click(fn=update_coding_rating_df, inputs=coding_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
-    long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
-    english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
-    chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
+    category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
 
     with gr.Accordion(
         "📝 Citation",
@@ -482,6 +474,11 @@ block_css = """
     padding-bottom: 6px;
 }
 
+#category_deets {
+    text-align: center;
+    padding: 0px;
+}
+
 #leaderboard_markdown {
     font-size: 104%
 }
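Taken together, the changes above replace the five per-category buttons with a single Dropdown whose .change event refreshes the table, plots, and stats markdown. A minimal, self-contained Gradio sketch of that wiring, with dummy tables standing in for the Elo results, looks roughly like this:

import gradio as gr
import pandas as pd

# Dummy per-category tables standing in for arena_dfs / category_elo_results.
tables = {
    "Total": pd.DataFrame({"Model": ["model-a", "model-b"], "Elo": [1200, 1100]}),
    "Coding": pd.DataFrame({"Model": ["model-a", "model-b"], "Elo": [1250, 1050]}),
}

def update_leaderboard(category):
    df = tables[category]
    details = f"### {category} Question Coverage\n#models: **{len(df)}**"
    return df, details

with gr.Blocks() as demo:
    category_dropdown = gr.Dropdown(choices=list(tables.keys()), label="Category", value="Total")
    category_deets = gr.Markdown(update_leaderboard("Total")[1])
    elo_display_df = gr.Dataframe(tables["Total"])
    # Same event wiring as the commit: dropdown change -> refresh table and stats.
    category_dropdown.change(update_leaderboard, inputs=[category_dropdown],
                             outputs=[elo_display_df, category_deets])

demo.launch()

The commit's update_leaderboard_and_plots returns seven outputs (table, four plots, and two markdown components), but the event pattern is the same.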
 
elo_results_20240403.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce8cebf41da8c06eee0f37156e01be83cc43182e0f00444311b4ad97a83154be
+size 690286
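The three lines above are a Git LFS pointer rather than the pickle itself: only the spec version, SHA-256 oid, and byte size live in git, and the ~690 KB object is pulled from LFS storage (for example with git lfs pull). A quick local sanity check against the committed pointer, assuming the file sits at the repo root, is:

import hashlib
from pathlib import Path

data = Path("elo_results_20240403.pkl").read_bytes()

# Compare the local file against the pointer recorded in this commit.
assert len(data) == 690286, "size does not match the LFS pointer"
expected_oid = "ce8cebf41da8c06eee0f37156e01be83cc43182e0f00444311b4ad97a83154be"
assert hashlib.sha256(data).hexdigest() == expected_oid, "oid does not match the LFS pointer"
print("LFS object matches the committed pointer")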