weichiang committed
Commit bf3bf20
1 Parent(s): ce16823
Files changed (1)
  1. app.py +3 -4
app.py CHANGED
@@ -26,7 +26,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **400,000** human preference votes to rank LLMs with the Elo ranking system.
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
     return leaderboard_md
 
@@ -36,7 +36,7 @@ def make_arena_leaderboard_md(arena_df):
     total_models = len(arena_df)
 
     leaderboard_md = f"""
-Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: March 26, 2024.
+Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: March 29, 2024.
 
 Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
 """
@@ -46,7 +46,7 @@ Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find m
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 200K+ user votes to compute Elo ratings.
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
 
@@ -210,7 +210,6 @@ def get_arena_table(arena_df, model_table_df):
     for i in range(len(arena_df)):
         row = []
         model_key = arena_df.index[i]
-        print(model_key)
         model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
             0
         ]
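
For context on the Elo figures referenced in the updated strings, below is a minimal sketch of how Elo ratings can be derived from pairwise preference votes. The function name, parameters, and input layout are assumptions for illustration; this is not the repo's actual pipeline, and the production leaderboard may fit ratings over all votes at once rather than in a single online pass.

```python
# Illustrative only: online Elo updates from pairwise preference votes.
# Assumed input: `battles`, an iterable of (model_a, model_b, winner) tuples,
# where winner is "model_a", "model_b", or "tie" (not this repo's actual schema).
from collections import defaultdict


def compute_elo(battles, k=4, scale=400, base=10, init_rating=1000):
    ratings = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = ratings[model_a], ratings[model_b]
        # Expected score of model_a under the Elo model.
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        # Actual score: 1 for a win, 0 for a loss, 0.5 for a tie.
        sa = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[winner]
        ratings[model_a] += k * (sa - ea)
        ratings[model_b] += k * ((1 - sa) - (1 - ea))
    return dict(ratings)


# Example usage with hypothetical vote records:
# compute_elo([("gpt-4", "llama-2-70b", "model_a"), ("gpt-4", "claude-2", "tie")])
```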