sheonhan committed
Commit a885f09
1 Parent(s): fe6d73e

style update

Files changed (1)
  1. app.py +92 -58
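
Every hunk below changes only whitespace, quoting, line wrapping, blank lines, or trailing commas, which is consistent with running an auto-formatter such as black over app.py; the exact tool is an assumption, since the commit message only says "style update". A minimal sketch of reproducing that kind of formatting pass locally:

# Hypothetical reproduction of a formatting-only pass (assumes black is installed;
# the tool choice is an assumption and is not stated anywhere in the commit).
import subprocess

# Preview what would change without touching the file.
subprocess.run(["python", "-m", "black", "--check", "--diff", "app.py"])
# Rewrite app.py in place.
subprocess.run(["python", "-m", "black", "app.py"], check=True)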
app.py CHANGED
@@ -15,7 +15,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", None))
 
-repo=None
+repo = None
 if H4_TOKEN:
     print("pulling repo")
     # try:
@@ -24,7 +24,10 @@ if H4_TOKEN:
     #     pass
 
     repo = Repository(
-        local_dir="./evals/", clone_from=LMEH_REPO, use_auth_token=H4_TOKEN, repo_type="dataset"
+        local_dir="./evals/",
+        clone_from=LMEH_REPO,
+        use_auth_token=H4_TOKEN,
+        repo_type="dataset",
     )
     repo.git_pull()
 
@@ -47,15 +50,33 @@ def load_results(model, benchmark, metric):
     return mean_acc, data["config"]["model_args"]
 
 
-COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]
-TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]
+COLS = [
+    "Model",
+    "Revision",
+    "Average ⬆️",
+    "ARC (25-shot) ⬆️",
+    "HellaSwag (10-shot) ⬆️",
+    "MMLU (5-shot) ⬆️",
+    "TruthfulQA (0-shot) ⬆️",
+]
+TYPES = [
+    "markdown",
+    "str",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+]
 
 if not IS_PUBLIC:
     COLS.insert(2, "8bit")
     TYPES.insert(2, "bool")
 
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
-EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
+EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
+
+
 def get_leaderboard():
     if repo:
         print("pulling changes")
@@ -65,44 +86,49 @@ def get_leaderboard():
 
     if not IS_PUBLIC:
         gpt4_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-            "Revision":"tech report",
-            "8bit":None,
-            "Average ⬆️":84.3,
-            "ARC (25-shot) ⬆️":96.3,
-            "HellaSwag (10-shot) ⬆️":95.3,
-            "MMLU (5-shot) ⬆️":86.4,
-            "TruthfulQA (0-shot) ⬆️":59.0,
+            "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
+            "Revision": "tech report",
+            "8bit": None,
+            "Average ⬆️": 84.3,
+            "ARC (25-shot) ⬆️": 96.3,
+            "HellaSwag (10-shot) ⬆️": 95.3,
+            "MMLU (5-shot) ⬆️": 86.4,
+            "TruthfulQA (0-shot) ⬆️": 59.0,
         }
         all_data.append(gpt4_values)
         gpt35_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-            "Revision":"tech report",
-            "8bit":None,
-            "Average ⬆️":71.9,
-            "ARC (25-shot) ⬆️":85.2,
-            "HellaSwag (10-shot) ⬆️":85.5,
-            "MMLU (5-shot) ⬆️":70.0,
-            "TruthfulQA (0-shot) ⬆️":47.0,
+            "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
+            "Revision": "tech report",
+            "8bit": None,
+            "Average ⬆️": 71.9,
+            "ARC (25-shot) ⬆️": 85.2,
+            "HellaSwag (10-shot) ⬆️": 85.5,
+            "MMLU (5-shot) ⬆️": 70.0,
+            "TruthfulQA (0-shot) ⬆️": 47.0,
         }
         all_data.append(gpt35_values)
 
     dataframe = pd.DataFrame.from_records(all_data)
-    dataframe = dataframe.sort_values(by=['Average ⬆️'], ascending=False)
+    dataframe = dataframe.sort_values(by=["Average ⬆️"], ascending=False)
     print(dataframe)
    dataframe = dataframe[COLS]
     return dataframe
 
+
 def get_eval_table():
     if repo:
         print("pulling changes for eval")
         repo.git_pull()
-    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    entries = [
+        entry
+        for entry in os.listdir("evals/eval_requests")
+        if not entry.startswith(".")
+    ]
     all_evals = []
 
     for entry in entries:
         print(entry)
-        if ".json"in entry:
+        if ".json" in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
                 data = json.load(fp)
@@ -111,21 +137,23 @@ def get_eval_table():
             data["model"] = make_clickable_model(data["model"])
             data["revision"] = data.get("revision", "main")
 
-
             all_evals.append(data)
         else:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+            sub_entries = [
+                e
+                for e in os.listdir(f"evals/eval_requests/{entry}")
+                if not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join("evals/eval_requests", entry, sub_entry)
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                #data["# params"] = get_n_params(data["model"])
+                # data["# params"] = get_n_params(data["model"])
                 data["model"] = make_clickable_model(data["model"])
                 all_evals.append(data)
 
-
     dataframe = pd.DataFrame.from_records(all_evals)
     return dataframe[EVAL_COLS]
 
@@ -133,6 +161,7 @@ def get_eval_table():
 leaderboard = get_leaderboard()
 eval_queue = get_eval_table()
 
+
 def is_model_on_hub(model_name, revision) -> bool:
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision)
@@ -144,7 +173,6 @@ def is_model_on_hub(model_name, revision) -> bool:
         return False
 
 
-
 def add_new_eval(
     model: str,
     base_model: str,
@@ -157,15 +185,15 @@ def add_new_eval(
     if revision == "":
         revision = "main"
     if is_delta_weight and not is_model_on_hub(base_model, revision):
-        error_message = f"Base model \"{base_model}\" was not found on hub!"
+        error_message = f'Base model "{base_model}" was not found on hub!'
         print(error_message)
         return f"<p style='color: red; font-size: 18px; text-align: center;'>{error_message}</p>"
 
     if not is_model_on_hub(model, revision):
-        error_message = f"Model \"{model}\"was not found on hub!"
+        error_message = f'Model "{model}"was not found on hub!'
         print(error_message)
         return f"<p style='color: red; font-size: 18px; text-align: center;'>{error_message}</p>"
-
+
     print("adding new eval")
 
     eval_entry = {
@@ -200,7 +228,7 @@ def add_new_eval(
         token=H4_TOKEN,
         repo_type="dataset",
     )
-
+
     success_message = "Your request has been submitted to the evaluation queue!"
     return f"<p style='color: green; font-size: 18px; text-align: center;'>{success_message}</p>"
 
@@ -209,11 +237,11 @@ def refresh():
     return get_leaderboard(), get_eval_table()
 
 
-
 block = gr.Blocks()
 with block:
     with gr.Row():
-        gr.Markdown(f"""
+        gr.Markdown(
+            f"""
 # 🤗 Open LLM Leaderboard
 <font size="4">With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released. We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
 
@@ -224,27 +252,32 @@ Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
 
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
-""")
+"""
+        )
 
     with gr.Row():
-        leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
-                                                    datatype=TYPES, max_rows=5)
-
-
+        leaderboard_table = gr.components.Dataframe(
+            value=leaderboard, headers=COLS, datatype=TYPES, max_rows=5
+        )
 
     with gr.Row():
-        gr.Markdown(f"""
+        gr.Markdown(
+            f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
 
-""")
+"""
+        )
     with gr.Accordion("Evaluation Queue", open=False):
        with gr.Row():
-            eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
-                                                 datatype=EVAL_TYPES, max_rows=5)
+            eval_table = gr.components.Dataframe(
+                value=eval_queue, headers=EVAL_COLS, datatype=EVAL_TYPES, max_rows=5
+            )
 
     with gr.Row():
         refresh_button = gr.Button("Refresh")
-        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+        refresh_button.click(
+            refresh, inputs=[], outputs=[leaderboard_table, eval_table]
+        )
 
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
@@ -253,7 +286,9 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
                 revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
 
             with gr.Column():
-                is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
+                is_8bit_toggle = gr.Checkbox(
+                    False, label="8 bit eval", visible=not IS_PUBLIC
+                )
                 private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                 is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(label="base model (for delta)")
@@ -263,18 +298,17 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
         with gr.Row():
             submission_result = gr.Markdown()
         submit_button.click(
-            add_new_eval,
-            [
-                model_name_textbox,
-                base_model_name_textbox,
-                revision_name_textbox,
-                is_8bit_toggle,
-                private,
-                is_delta_weight,
-            ],
-            submission_result
-        )
-
+            add_new_eval,
+            [
+                model_name_textbox,
+                base_model_name_textbox,
+                revision_name_textbox,
+                is_8bit_toggle,
+                private,
+                is_delta_weight,
+            ],
+            submission_result,
+        )
 
 block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
 block.launch()
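
Because a style-only commit should leave runtime behavior untouched, one way to sanity-check a diff like this is to confirm that the file parses to the same abstract syntax tree before and after the change. A minimal sketch, assuming local copies of app.py at the parent commit and at this commit (the file names below are placeholders, not files in this repo):

# Sketch: verify that only formatting changed between two revisions of app.py.
# "app_before.py" and "app_after.py" are hypothetical local copies of the file
# at fe6d73e and a885f09 respectively.
import ast


def same_ast(old_source: str, new_source: str) -> bool:
    """True when both sources produce identical ASTs, i.e. only formatting differs."""
    return ast.dump(ast.parse(old_source)) == ast.dump(ast.parse(new_source))


with open("app_before.py") as before, open("app_after.py") as after:
    print(same_ast(before.read(), after.read()))  # expected: True for a pure style update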