Sean Cho committed on
Commit 07b29ce • 1 Parent(s): 2835e1b

update app

Files changed (1)
  1. app.py +491 -4
app.py CHANGED
@@ -1,7 +1,494 @@
 import gradio as gr

-def greet(name):
-    return "Hello " + name + "!!"

-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+import json
+import os
+from datetime import datetime, timezone
+
+
 import gradio as gr
+import numpy as np
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from transformers import AutoConfig
+
+from src.auto_leaderboard.get_model_metadata import apply_metadata
+from src.assets.text_content import *
+from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
+from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
+from src.assets.css_html_js import custom_css, get_window_url_params
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
+from src.init import get_all_requested_models, load_all_info_from_hub
+
+pd.set_option('display.precision', 1)
+
+# clone / pull the lmeh eval data
+H4_TOKEN = os.environ.get("H4_TOKEN", None)
+
+QUEUE_REPO = "open-llm-leaderboard/requests"
+RESULTS_REPO = "open-llm-leaderboard/results"
+
+PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
+
+IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+
+EVAL_REQUESTS_PATH = "eval-queue"
+EVAL_RESULTS_PATH = "eval-results"
+
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
+
+api = HfApi()
+
+def restart_space():
+    api.restart_space(
+        repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
+    )
+
+eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
+
+if not IS_PUBLIC:
+    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
+else:
+    eval_queue_private, eval_results_private = None, None
+
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+if not IS_PUBLIC:
+    COLS.insert(2, AutoEvalColumn.precision.name)
+    TYPES.insert(2, AutoEvalColumn.precision.type)
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
+
+
+def get_leaderboard_df():
+    if eval_results:
+        print("Pulling evaluation results for the leaderboard.")
+        eval_results.git_pull()
+    if eval_results_private:
+        print("Pulling evaluation results for the leaderboard.")
+        eval_results_private.git_pull()
+
+    all_data = get_eval_results_dicts(IS_PUBLIC)
+
+    if not IS_PUBLIC:
+        all_data.append(gpt4_values)
+        all_data.append(gpt35_values)
+
+    all_data.append(baseline)
+    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
+
+    df = pd.DataFrame.from_records(all_data)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[COLS].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
+    return df
+
+
+def get_evaluation_queue_df():
+    if eval_queue:
+        print("Pulling changes for the evaluation queue.")
+        eval_queue.git_pull()
+    if eval_queue_private:
+        print("Pulling changes for the evaluation queue.")
+        eval_queue_private.git_pull()
+
+    entries = [
+        entry
+        for entry in os.listdir(EVAL_REQUESTS_PATH)
+        if not entry.startswith(".")
+    ]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data["# params"] = "unknown"
+            data["model"] = make_clickable_model(data["model"])
+            data["revision"] = data.get("revision", "main")
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [
+                e
+                for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
+                if not e.startswith(".")
+            ]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                # data["# params"] = get_n_params(data["model"])
+                data["model"] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
+    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
+    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
+
+
+
+original_df = get_leaderboard_df()
+leaderboard_df = original_df.copy()
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df()
+
+def is_model_on_hub(model_name, revision) -> bool:
+    try:
+        AutoConfig.from_pretrained(model_name, revision=revision)
+        return True, None
+
+    except ValueError as e:
+        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
+
+    except Exception as e:
+        print(f"Could not get the model config from the hub.: {e}")
+        return False, "was not found on hub!"
+
+
+def add_new_eval(
+    model: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    private: bool,
+    weight_type: str,
+    model_type: str,
+):
+    precision = precision.split(" ")[0]
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    if model_type is None or model_type == "":
+        return styled_error("Please select a model type.")
+
+    # check the model actually exists before adding the eval
+    if revision == "":
+        revision = "main"
+
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error = is_model_on_hub(base_model, revision)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+
+    if not weight_type == "Adapter":
+        model_on_hub, error = is_model_on_hub(model, revision)
+        if not model_on_hub:
+            return styled_error(f'Model "{model}" {error}')
+
+    print("adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "base_model": base_model,
+        "revision": revision,
+        "private": private,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": "PENDING",
+        "submitted_time": current_time,
+        "model_type": model_type,
+    }
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
+
+    # Check for duplicate submission
+    if out_path.split("eval-queue/")[1].lower() in requested_models:
+        return styled_warning("This model has been already submitted.")
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    api.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
+        token=H4_TOKEN,
+        repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
+    )
+
+    # remove the local file
+    os.remove(out_path)
+
+    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
+
+
+def refresh():
+    leaderboard_df = get_leaderboard_df()
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df()
+    return (
+        leaderboard_df,
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    )
+
+
+def search_table(df, leaderboard_table, query):
+    if AutoEvalColumn.model_type.name in leaderboard_table.columns:
+        filtered_df = df[
+            (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
+            | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
+        ]
+    else:
+        filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+    return filtered_df[leaderboard_table.columns]
+
+
+def select_columns(df, columns):
+    always_here_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+    # We use COLS to maintain sorting
+    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]]
+    return filtered_df
+
+#TODO allow this to filter by values of any columns
+def filter_items(df, leaderboard_table, query):
+    if query == "all":
+        return df[leaderboard_table.columns]
+    else:
+        query = query[0]  # take only the emoji character
+    if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
+        filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
+    else:
+        return leaderboard_table.columns
+    return filtered_df[leaderboard_table.columns]
+
+def change_tab(query_param):
+    query_param = query_param.replace("'", '"')
+    query_param = json.loads(query_param)
+
+    if (
+        isinstance(query_param, dict)
+        and "tab" in query_param
+        and query_param["tab"] == "evaluation"
+    ):
+        return gr.Tabs.update(selected=1)
+    else:
+        return gr.Tabs.update(selected=0)
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                shown_columns = gr.CheckboxGroup(
+                    choices=[c for c in COLS if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
+                    value=[c for c in COLS_LITE if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
+                    label="Select columns to show",
+                    elem_id="column-select",
+                    interactive=True,
+                )
+                with gr.Column(min_width=320):
+                    search_bar = gr.Textbox(
+                        placeholder="🔍 Search for your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                    filter_columns = gr.Radio(
+                        label="⏚ Filter model types",
+                        choices=[
+                            "all",
+                            ModelType.PT.to_str(),
+                            ModelType.FT.to_str(),
+                            ModelType.IFT.to_str(),
+                            ModelType.RL.to_str(),
+                        ],
+                        value="all",
+                        elem_id="filter-columns"
+                    )
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name]],
+                headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
+                datatype=TYPES,
+                max_rows=None,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df,
+                headers=COLS,
+                datatype=TYPES,
+                max_rows=None,
+                visible=False,
+            )
+            search_bar.submit(
+                search_table,
+                [hidden_leaderboard_table_for_search, leaderboard_table, search_bar],
+                leaderboard_table,
+            )
+            shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table)
+            filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table)
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                max_rows=5,
+                            )
+                    with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                max_rows=5,
+                            )
+
+                    with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                max_rows=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(
+                        label="revision", placeholder="main"
+                    )
+                    private = gr.Checkbox(
+                        False, label="Private", visible=not IS_PUBLIC
+                    )
+                    model_type = gr.Dropdown(
+                        choices=[
+                            ModelType.PT.to_str(" : "),
+                            ModelType.FT.to_str(" : "),
+                            ModelType.IFT.to_str(" : "),
+                            ModelType.RL.to_str(" : "),
+                        ],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=["Original", "Delta", "Adapter"],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(
+                        label="Base model (for delta or adapter weights)"
+                    )
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    private,
+                    weight_type,
+                    model_type
+                ],
+                submission_result,
+            )
+
+    with gr.Row():
+        refresh_button = gr.Button("Refresh")
+        refresh_button.click(
+            refresh,
+            inputs=[],
+            outputs=[
+                leaderboard_table,
+                finished_eval_table,
+                running_eval_table,
+                pending_eval_table,
+            ],
+        )
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ).style(show_copy_button=True)
+
+    dummy = gr.Textbox(visible=False)
+    demo.load(
+        change_tab,
+        dummy,
+        tabs,
+        _js=get_window_url_params,
+    )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
+scheduler.start()
+demo.queue(concurrency_count=40).launch()
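
For reference, the request file that add_new_eval writes and then uploads to the requests dataset is simply the JSON dump of eval_entry, stored under eval-queue/<user_name>/ with a name following the pattern <model_path>_eval_request_<private>_<precision>_<weight_type>.json. With purely illustrative, hypothetical values (the model name and timestamp below are made up, and model_type holds whichever dropdown label was selected), such a file would look roughly like:

    {
      "model": "my-org/my-model",
      "base_model": "",
      "revision": "main",
      "private": false,
      "precision": "float16",
      "weight_type": "Original",
      "status": "PENDING",
      "submitted_time": "2023-06-30T12:00:00Z",
      "model_type": "<selected dropdown label>"
    }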