lewtun (HF staff) committed
Commit 614ee1f • 1 parent: 7eee6bd

Fix TruthQA typo

Files changed (2):
  1. app.py +44 -46
  2. utils.py +1 -1
app.py CHANGED
@@ -43,11 +43,11 @@ def load_results(model, benchmark, metric):
     with open(file_path) as fp:
         data = json.load(fp)
     accs = np.array([v[metric] for k, v in data["results"].items()])
-    mean_acc = np.mean(accs)
+    mean_acc = np.mean(accs)
     return mean_acc, data["config"]["model_args"]
 
 
-COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
+COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]
 TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]
 
 if not IS_PUBLIC:
@@ -57,36 +57,36 @@ if not IS_PUBLIC:
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
 def get_leaderboard():
-    if repo:
+    if repo:
        print("pulling changes")
        repo.git_pull()
-
+
    all_data = get_eval_results_dicts(IS_PUBLIC)
-
+
    if not IS_PUBLIC:
        gpt4_values = {
-           "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-           "Revision":"tech report",
+           "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
+           "Revision":"tech report",
            "8bit":None,
            "Average ⬆️":84.3,
            "ARC (25-shot) ⬆️":96.3,
            "HellaSwag (10-shot) ⬆️":95.3,
            "MMLU (5-shot) ⬆️":86.4,
-           "TruthQA (0-shot) ⬆️":59.0,
+           "TruthfulQA (0-shot) ⬆️":59.0,
        }
        all_data.append(gpt4_values)
        gpt35_values = {
-           "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-           "Revision":"tech report",
+           "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
+           "Revision":"tech report",
            "8bit":None,
            "Average ⬆️":71.9,
            "ARC (25-shot) ⬆️":85.2,
            "HellaSwag (10-shot) ⬆️":85.5,
            "MMLU (5-shot) ⬆️":70.0,
-           "TruthQA (0-shot) ⬆️":47.0,
+           "TruthfulQA (0-shot) ⬆️":47.0,
        }
        all_data.append(gpt35_values)
-
+
    dataframe = pd.DataFrame.from_records(all_data)
    dataframe = dataframe.sort_values(by=['Average ⬆️'], ascending=False)
    print(dataframe)
@@ -94,38 +94,38 @@ def get_leaderboard():
    return dataframe
 
 def get_eval_table():
-    if repo:
+    if repo:
        print("pulling changes for eval")
        repo.git_pull()
-    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
    all_evals = []
-
+
    for entry in entries:
        print(entry)
        if ".json"in entry:
            file_path = os.path.join("evals/eval_requests", entry)
            with open(file_path) as fp:
                data = json.load(fp)
-
+
            data["# params"] = "unknown"
            data["model"] = make_clickable_model(data["model"])
            data["revision"] = data.get("revision", "main")
-
+
 
            all_evals.append(data)
        else:
            # this is a folder
-           sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+           sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
            for sub_entry in sub_entries:
                file_path = os.path.join("evals/eval_requests", entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)
-
+
                #data["# params"] = get_n_params(data["model"])
                data["model"] = make_clickable_model(data["model"])
                all_evals.append(data)
 
-
+
    dataframe = pd.DataFrame.from_records(all_evals)
    return dataframe[EVAL_COLS]
 
@@ -137,12 +137,12 @@ def is_model_on_hub(model_name, revision) -> bool:
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision)
        return True
-
+
    except Exception as e:
        print("Could not get the model config from the hub")
        print(e)
        return False
-
+
 
 
 def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
@@ -152,12 +152,12 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
    if is_delta_weight and not is_model_on_hub(base_model, revision):
        print(base_model, "base model not found on hub")
        return
-
+
    if not is_model_on_hub(model, revision):
        print(model, "not found on hub")
        return
    print("adding new eval")
-
+
    eval_entry = {
        "model" : model,
        "base_model" : base_model,
@@ -166,22 +166,22 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
        "8bit_eval" : is_8_bit_eval,
        "is_delta_weight" : is_delta_weight,
        "status" : "PENDING"
-    }
-
+    }
+
    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]
-
+
    OUT_DIR=f"eval_requests/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
-
+
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))
    LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
-
+
    api = HfApi()
    api.upload_file(
        path_or_fileobj=out_path,
@@ -191,14 +191,14 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
        repo_type="dataset",
    )
 
-
+
 def refresh():
    return get_leaderboard(), get_eval_table()
-
+
 
 
 block = gr.Blocks()
-with block:
+with block:
    with gr.Row():
        gr.Markdown(f"""
 # 🤗 Open LLM Leaderboard
@@ -208,49 +208,47 @@ Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> Truthful QA MC </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
 
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
 """)
-
+
    with gr.Row():
        leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                    datatype=TYPES, max_rows=5)
 
-
-
+
+
    with gr.Row():
        gr.Markdown(f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
-
+
 """)
    with gr.Accordion("Evaluation Queue", open=False):
        with gr.Row():
            eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
-                                                datatype=EVAL_TYPES, max_rows=5)
-
+                                                datatype=EVAL_TYPES, max_rows=5)
+
    with gr.Row():
        refresh_button = gr.Button("Refresh")
-       refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
-
+       refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+
    with gr.Accordion("Submit a new model for evaluation"):
-       # with gr.Row():
-       #     gr.Markdown(f"""# Submit a new model for evaluation""")
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")
                revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
-
+
            with gr.Column():
                is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
                private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                is_delta_weight = gr.Checkbox(False, label="Delta weights")
                base_model_name_textbox = gr.Textbox(label="base model (for delta)")
-
+
        with gr.Row():
            submit_button = gr.Button("Submit Eval")
            submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
-
+
 
 block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
 block.launch()
utils.py CHANGED
@@ -21,7 +21,7 @@ BENCH_TO_NAME = {
    "arc_challenge":"ARC (25-shot) ⬆️",
    "hellaswag":"HellaSwag (10-shot) ⬆️",
    "hendrycks":"MMLU (5-shot) ⬆️",
-    "truthfulqa_mc":"TruthQA (0-shot) ⬆️",
+    "truthfulqa_mc":"TruthfulQA (0-shot) ⬆️",
 }
 def make_clickable_model(model_name):
    LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]