lmzheng committed on
Commit
49e21e1
·
1 Parent(s): dcb1547

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -2
app.py CHANGED
@@ -25,7 +25,7 @@ def make_leaderboard_md(elo_results):
25
  - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
26
  - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
27
 
28
- 💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
29
  """
30
  return leaderboard_md
31
 
@@ -233,6 +233,9 @@ Please note that you may see different orders from different ranking methods. Th
233
  "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
234
  )
235
  plot_4 = gr.Plot(p4, show_label=False)
 
 
 
236
  return [md_1, plot_1, plot_2, plot_3, plot_4]
237
 
238
  block_css = """
@@ -294,7 +297,6 @@ def build_demo(elo_results_file, leaderboard_table_file):
294
  leader_components = build_leaderboard_tab(
295
  elo_results_file, leaderboard_table_file
296
  )
297
- gr.Markdown(acknowledgment_md)
298
 
299
  return demo
300
 
 
25
  - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
26
  - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
27
 
28
+ 💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023.
29
  """
30
  return leaderboard_md
31
 
 
233
  "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
234
  )
235
  plot_4 = gr.Plot(p4, show_label=False)
236
+
237
+ gr.Markdown(acknowledgment_md)
238
+
239
  return [md_1, plot_1, plot_2, plot_3, plot_4]
240
 
241
  block_css = """
 
297
  leader_components = build_leaderboard_tab(
298
  elo_results_file, leaderboard_table_file
299
  )
 
300
 
301
  return demo
302