sheonhan committed
Commit 58733e4 • 1 parent: 46f8d78

sync with the internal version

Files changed (2)
  1. app.py +8 -26
  2. content.py +23 -0
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import pandas as pd
 
 from apscheduler.schedulers.background import BackgroundScheduler
-from content import CHANGELOG_TEXT
+from content import *
 from huggingface_hub import Repository, HfApi
 from transformers import AutoConfig
 from utils import get_eval_results_dicts, make_clickable_model
@@ -32,7 +32,6 @@ def get_all_requested_models(requested_models_dir):
 
     return set([file_name.lower().split("./evals/")[1] for file_name in file_names])
 
-
 repo = None
 requested_models = None
 if H4_TOKEN:
@@ -56,7 +55,6 @@ if H4_TOKEN:
 
 # parse the results
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
-
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 
 
@@ -113,7 +111,6 @@ def has_no_nan_values(df, columns):
 def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)
 
-
 def get_leaderboard():
     if repo:
         print("pulling changes")
@@ -188,7 +185,6 @@ def get_eval_table():
     all_evals = []
 
     for entry in entries:
-        # print(entry)
         if ".json" in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
@@ -257,7 +253,6 @@ def add_new_eval(
 
     if not is_model_on_hub(model, revision):
         error_message = f'Model "{model}"was not found on hub!'
-        print(error_message)
         return f"<p style='color: red; font-size: 20px; text-align: center;'>{error_message}</p>"
 
     print("adding new eval")
@@ -308,30 +303,21 @@ def refresh():
     finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
     return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
 
-
 custom_css = """
 #changelog-text {
     font-size: 18px !important;
 }
+
+.markdown-text {
+    font-size: 16px !important;
+}
 """
 
 demo = gr.Blocks(css=custom_css)
 with demo:
+    gr.HTML(TITLE)
     with gr.Row():
-        gr.Markdown(
-            f"""
-# 🤗 Open LLM Leaderboard
-<font size="4">With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released. We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
-
-Evaluation is performed against 4 popular benchmarks:
-- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
-- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
-- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
-
-We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
-"""
-        )
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Accordion("CHANGELOG", open=False):
         changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
@@ -342,12 +328,8 @@ We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
         )
 
     with gr.Row():
-        gr.Markdown(
-            f"""
-# Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
+        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-"""
-        )
     with gr.Accordion("✅ Finished Evaluations", open=False):
         with gr.Row():
             finished_eval_table = gr.components.Dataframe(
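
Taken together, the app.py side of this commit is a small refactor: the long inline Markdown strings are moved out of the layout code into content.py constants, rendered through gr.HTML / gr.Markdown, with a shared .markdown-text CSS class controlling their size. Below is a minimal sketch of the resulting pattern, not the leaderboard app itself; it assumes a content.py that defines the four constants named in the diff and a Gradio version that supports elem_classes, and it imports the constants by name for readability, whereas the commit itself switches to `from content import *`.

```python
import gradio as gr

# Constants are assumed to live in content.py, as in this commit's diff.
from content import TITLE, INTRODUCTION_TEXT, CHANGELOG_TEXT, EVALUATION_QUEUE_TEXT

# Page-level CSS is passed to gr.Blocks once; individual components opt in
# through elem_id / elem_classes, which is what the ".markdown-text" rule
# is matched against.
custom_css = """
#changelog-text {
    font-size: 18px !important;
}

.markdown-text {
    font-size: 16px !important;
}
"""

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)  # raw <h1> markup, hence gr.HTML rather than gr.Markdown
    with gr.Row():
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Accordion("CHANGELOG", open=False):
        gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")

    with gr.Row():
        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```

Keeping the copy in one module means text edits no longer touch the layout code, which appears to be the point of syncing with the internal version.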
content.py CHANGED
@@ -1,6 +1,7 @@
 CHANGELOG_TEXT = f"""
 ## [2023-05-29]
 - Auto-restart every hour
+- Sync with the internal version (minor style changes)
 
 ## [2023-05-24]
 - Added a baseline that has 25.0 for all values.
@@ -24,3 +25,25 @@ CHANGELOG_TEXT = f"""
 ## [2023-05-10]
 - Released the leaderboard to public.
 """
+
+TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
+
+INTRODUCTION_TEXT = f"""
+📝 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
+
+🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
+
+📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, TK.
+
+Evaluation is performed against 4 popular benchmarks:
+- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
+- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
+- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
+
+We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
+"""
+
+EVALUATION_QUEUE_TEXT = f"""
+# Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
+"""