edbeeching committed
Commit 1f60a20
1 Parent(s): 9346f1c

updates eval leaderboard so new evals can be added

Files changed (1): app.py +175 -22
app.py CHANGED
@@ -2,21 +2,27 @@ import os
 import shutil
 import numpy as np
 import gradio as gr
-from huggingface_hub import Repository
+from huggingface_hub import Repository, HfApi
+from transformers import AutoConfig
 import json
 from apscheduler.schedulers.background import BackgroundScheduler
 import pandas as pd
+import datetime
+
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
+LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
+
 repo=None
 if H4_TOKEN:
+    print("pulling repo")
     # try:
     #     shutil.rmtree("./evals/")
     # except:
     #     pass

     repo = Repository(
-        local_dir="./evals/", clone_from="HuggingFaceH4/lmeh_evaluations", use_auth_token=H4_TOKEN, repo_type="dataset"
+        local_dir="./evals/", clone_from=LMEH_REPO, use_auth_token=H4_TOKEN, repo_type="dataset"
     )
     repo.git_pull()

@@ -24,16 +30,13 @@ if H4_TOKEN:
 # parse the results
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
-    "arc_challenge":"ARC",
-    "hellaswag":"HellaSwag",
-    "hendrycks":"MMLU",
-    "truthfulqa_mc":"TruthQA",
+    "arc_challenge":"ARC (25-shot) ⬆️",
+    "hellaswag":"HellaSwag (10-shot) ⬆️",
+    "hendrycks":"MMLU (5-shot) ⬆️",
+    "truthfulqa_mc":"TruthQA (0-shot) ⬆️",
 }
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

-entries = [entry for entry in os.listdir("evals") if not entry.startswith('.')]
-model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))]
-

 def make_clickable_model(model_name):
     # remove user from model name
@@ -53,11 +56,34 @@ def load_results(model, benchmark, metric):
     mean_acc = np.mean(accs)
     return mean_acc, data["config"]["model_args"]

-COLS = ["eval_name", "total", "ARC", "HellaSwag", "MMLU", "TruthQA", "base_model"]
-TYPES = ["str", "number", "number", "number", "number", "number","markdown", ]
+def get_n_params(base_model):
+
+    # config = AutoConfig.from_pretrained(model_name)
+
+    # # Retrieve the number of parameters from the configuration
+    # try:
+    #     num_params = config.n_parameters
+    # except AttributeError:
+    #     print(f"Error: The number of parameters is not available in the config for the model '{model_name}'.")
+    #     return None
+
+    # return num_params
+
+    now = datetime.datetime.now()
+    time_string = now.strftime("%Y-%m-%d %H:%M:%S")
+    return time_string
+
+COLS = ["eval_name", "# params", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️", "base_model"]
+TYPES = ["str","str", "number", "number", "number", "number", "number","markdown", ]
+
+EVAL_COLS = ["model","# params", "private", "8bit_eval", "is_delta_weight", "status"]
+EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
 def get_leaderboard():
     if repo:
+        print("pulling changes")
         repo.git_pull()
+    entries = [entry for entry in os.listdir("evals") if not (entry.startswith('.') or entry=="eval_requests")]
+    model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))]
     all_data = []
     for model in model_directories:
         model_data = {"base_model": None}
@@ -65,46 +91,173 @@ def get_leaderboard():

         for benchmark, metric in zip(BENCHMARKS, METRICS):
             value, base_model = load_results(model, benchmark, metric)
-            model_data[BENCH_TO_NAME[benchmark]] = value
+            model_data[BENCH_TO_NAME[benchmark]] = round(value,3)
             if base_model is not None: # in case the last benchmark failed
                 model_data["base_model"] = base_model

-        model_data["total"] = sum(model_data[benchmark] for benchmark in BENCH_TO_NAME.values())
+        model_data["total ⬆️"] = round(sum(model_data[benchmark] for benchmark in BENCH_TO_NAME.values()),3)

         if model_data["base_model"] is not None:
             model_data["base_model"] = make_clickable_model(model_data["base_model"])
+
+        model_data["# params"] = get_n_params(model_data["base_model"])
+
         all_data.append(model_data)

     dataframe = pd.DataFrame.from_records(all_data)
-    dataframe = dataframe.sort_values(by=['total'], ascending=False)
+    dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)

     dataframe = dataframe[COLS]
     return dataframe

+def get_eval_table():
+    if repo:
+        print("pulling changes for eval")
+        repo.git_pull()
+    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    all_evals = []
+
+    for entry in entries:
+        print(entry)
+        if ".json"in entry:
+            file_path = os.path.join("evals/eval_requests", entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data["# params"] = get_n_params(data["model"])
+            data["model"] = make_clickable_model(data["model"])
+
+
+            all_evals.append(data)
+        else:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+            for sub_entry in sub_entries:
+                file_path = os.path.join("evals/eval_requests", entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data["# params"] = get_n_params(data["model"])
+                data["model"] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+
+    dataframe = pd.DataFrame.from_records(all_evals)
+    return dataframe[EVAL_COLS]
+
+
 leaderboard = get_leaderboard()
+eval_queue = get_eval_table()
+
+def is_model_on_hub(model_name) -> bool:
+    try:
+        config = AutoConfig.from_pretrained(model_name)
+        return True
+
+    except Exception as e:
+        print("Could not get the model config from the hub")
+        print(e)
+        return False
+
+
+
+def add_new_eval(model:str, private:bool, is_8_bit_eval: bool, is_delta_weight:bool):
+    # check the model actually exists before adding the eval
+    if not is_model_on_hub(model):
+        print(model, "not found on hub")
+        return
+    print("adding new eval")
+
+    eval_entry = {
+        "model" : model,
+        "private" : private,
+        "8bit_eval" : is_8_bit_eval,
+        "is_delta_weight" : is_delta_weight,
+        "status" : "PENDING"
+    }
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    OUT_DIR=f"eval_requests/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+    LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
+
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path,
+        repo_id=LMEH_REPO,
+        token=H4_TOKEN,
+        repo_type="dataset",
+    )
+
+
+def refresh():
+    return get_leaderboard(), get_eval_table()
+
+

 block = gr.Blocks()
 with block:
-    gr.Markdown(f"""
-    # H4 Model Evaluation leaderboard using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> LMEH benchmark suite </a>.
-    Evaluation is performed against 4 popular benchmarks AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthFul QC MC. To run your own benchmarks, refer to the README in the H4 repo.
-    """)
+    with gr.Row():
+        gr.Markdown(f"""
+        # 🤗 H4 Model Evaluation leaderboard using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> LMEH benchmark suite </a>.
+        Evaluation is performed against 4 popular benchmarks AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthFul QC MC. To run your own benchmarks, refer to the README in the H4 repo.
+        """)

     with gr.Row():
         leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                     datatype=TYPES, max_rows=5)
+
+
+
     with gr.Row():
-        refresh_button = gr.Button("Refresh")
-        refresh_button.click(get_leaderboard, inputs=[], outputs=leaderboard_table)
+        gr.Markdown(f"""
+        # Evaluation Queue for the LMEH benchmarks, these models will be automatically evaluated on the 🤗 cluster

+        """)

+    with gr.Row():
+        eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
+                                             datatype=EVAL_TYPES, max_rows=5)
+
+    with gr.Row():
+        refresh_button = gr.Button("Refresh")
+        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+
+    with gr.Accordion("Submit a new model for evaluation"):
+        # with gr.Row():
+        #     gr.Markdown(f"""# Submit a new model for evaluation""")
+        with gr.Row():
+            model_name_textbox = gr.Textbox(label="model_name")
+            is_8bit_toggle = gr.Checkbox(False, label="8 bit Eval?")
+            private = gr.Checkbox(False, label="Private?")
+            is_delta_weight = gr.Checkbox(False, label="Delta Weights?")
+
+        with gr.Row():
+            submit_button = gr.Button("Submit Eval")
+            submit_button.click(add_new_eval, [model_name_textbox, is_8bit_toggle, private, is_delta_weight])
+
+
+

-block.launch()

+
+print("adding refresh leaderboard")
 def refresh_leaderboard():
     leaderboard_table = get_leaderboard()
     print("leaderboard updated")

 scheduler = BackgroundScheduler()
 scheduler.add_job(func=refresh_leaderboard, trigger="interval", seconds=300) # refresh every 5 mins
-scheduler.start()
+scheduler.start()
+
+block.launch()
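
For reference, the submission path added in this commit comes down to writing one JSON file per request and uploading it to the dataset repo. The sketch below is a minimal, self-contained illustration (not part of the commit): it mirrors the file naming and payload produced by add_new_eval() above, using the hypothetical model id "someuser/some-model", and it only reproduces the local file-writing step, not the HfApi upload.

import json
import os

# Hypothetical submission, mirroring add_new_eval() above.
model = "someuser/some-model"  # assumed example model id, not a real submission
private, is_8_bit_eval, is_delta_weight = False, False, False

# Same fields and default "PENDING" status as the eval_entry dict in the commit.
eval_entry = {
    "model": model,
    "private": private,
    "8bit_eval": is_8_bit_eval,
    "is_delta_weight": is_delta_weight,
    "status": "PENDING",
}

# One folder per user, one file per (model, flags) combination.
user_name, model_path = model.split("/")
out_dir = f"eval_requests/{user_name}"
os.makedirs(out_dir, exist_ok=True)

# -> eval_requests/someuser/some-model_eval_request_False_False_False.json
out_path = f"{out_dir}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
with open(out_path, "w") as f:
    f.write(json.dumps(eval_entry))

Once the real app uploads such a file under eval_requests/ in the HuggingFaceH4/lmeh_evaluations dataset, get_eval_table() finds it in the local clone at evals/eval_requests/ on the next pull and displays it in the queue table with the EVAL_COLS fields, including its status.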