Sean Cho committed
Commit bcb8d03 • 1 Parent(s): 2a9714f

update to latest

README.md CHANGED
@@ -4,10 +4,11 @@ emoji: 📉
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 3.27.0
+sdk_version: 3.43.2
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: HuggingFaceH4/open_llm_leaderboard
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -222,21 +222,6 @@ def add_new_eval(
 
 
 # Basics
-def refresh() -> list[pd.DataFrame]:
-    leaderboard_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
-    return (
-        leaderboard_df,
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    )
-
-
 def change_tab(query_param: str):
     query_param = query_param.replace("'", '"')
     query_param = json.loads(query_param)
@@ -248,17 +233,16 @@ def change_tab(query_param: str):
 
 
 # Searching and filtering
-def search_table(df: pd.DataFrame, current_columns_df: pd.DataFrame, query: str) -> pd.DataFrame:
-    current_columns = current_columns_df.columns
-    if AutoEvalColumn.model_type.name in current_columns:
-        filtered_df = df[
-            (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
-            | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
-        ]
-    else:
-        filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-    return filtered_df[current_columns]
+def update_table(hidden_df: pd.DataFrame, current_columns_df: pd.DataFrame, columns: list, type_query: list, precision_query: str, size_query: list, show_deleted: bool, query: str):
+    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
+    if query != "":
+        filtered_df = search_table(filtered_df, query)
+    df = select_columns(filtered_df, columns)
+
+    return df
 
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
@@ -272,31 +256,32 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 NUMERIC_INTERVALS = {
-    "< 1.5B": (0, 1.5),
-    "~3B": (1.5, 5),
-    "~7B": (6, 11),
-    "~13B": (12, 15),
-    # "~35B": (16, 55),
-    # "60B+": (55, 10000),
+    "Unknown": pd.Interval(-1, 0, closed="right"),
+    "< 1.5B": pd.Interval(0, 1.5, closed="right"),
+    "~3B": pd.Interval(1.5, 5, closed="right"),
+    "~7B": pd.Interval(6, 11, closed="right"),
+    "~13B": pd.Interval(12, 15, closed="right"),
+    # "~35B": pd.Interval(16, 55, closed="right"),
+    # "60B+": pd.Interval(55, 10000, closed="right"),
 }
 
 def filter_models(
-    df: pd.DataFrame, current_columns_df: pd.DataFrame, type_query: list, size_query: list, show_deleted: bool
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
-    current_columns = current_columns_df.columns
-
     # Show all models
     if show_deleted:
-        filtered_df = df[current_columns]
+        filtered_df = df
     else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True][current_columns]
+        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query)]
 
-    numeric_interval = [NUMERIC_INTERVALS[s] for s in size_query]
+    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    filtered_df = filtered_df[params_column.between(numeric_interval[0][0], numeric_interval[-1][1])]
+    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+    filtered_df = filtered_df.loc[mask]
 
     return filtered_df
 
@@ -310,6 +295,12 @@ with demo:
     with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
         with gr.Row():
             with gr.Column():
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
                 with gr.Row():
                     shown_columns = gr.CheckboxGroup(
                         choices=[
@@ -343,11 +334,6 @@ with demo:
                        value=True, label="👀 Show gated/private/deleted models", interactive=True
                    )
            with gr.Column(min_width=320):
-                search_bar = gr.Textbox(
-                    placeholder="🔍 Search for your model and press ENTER...",
-                    show_label=False,
-                    elem_id="search-bar",
-                )
                with gr.Box(elem_id="box-filter"):
                    filter_columns_type = gr.CheckboxGroup(
                        label="Model types",
@@ -366,6 +352,13 @@ with demo:
                        interactive=True,
                        elem_id="filter-columns-type",
                    )
+                    filter_columns_precision = gr.CheckboxGroup(
+                        label="Precision",
+                        choices=["torch.float16"], #, "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
+                        value=["torch.float16"], #, "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
+                        interactive=False,
+                        elem_id="filter-columns-precision",
+                    )
                    filter_columns_size = gr.CheckboxGroup(
                        label="Model sizes",
                        choices=list(NUMERIC_INTERVALS.keys()),
@@ -402,55 +395,93 @@ with demo:
            visible=False,
        )
        search_bar.submit(
-            search_table,
+            update_table,
            [
                hidden_leaderboard_table_for_search,
                leaderboard_table,
+                shown_columns,
+                filter_columns_type,
+                filter_columns_precision,
+                filter_columns_size,
+                deleted_models_visibility,
                search_bar,
            ],
            leaderboard_table,
        )
        shown_columns.change(
-            select_columns,
-            [hidden_leaderboard_table_for_search, shown_columns],
+            update_table,
+            [
+                hidden_leaderboard_table_for_search,
+                leaderboard_table,
+                shown_columns,
+                filter_columns_type,
+                filter_columns_precision,
+                filter_columns_size,
+                deleted_models_visibility,
+                search_bar,
+            ],
            leaderboard_table,
-            queue=False,
+            queue=True,
        )
        filter_columns_type.change(
-            filter_models,
+            update_table,
            [
                hidden_leaderboard_table_for_search,
                leaderboard_table,
+                shown_columns,
                filter_columns_type,
+                filter_columns_precision,
                filter_columns_size,
                deleted_models_visibility,
+                search_bar,
            ],
            leaderboard_table,
-            queue=False,
+            queue=True,
+        )
+        filter_columns_precision.change(
+            update_table,
+            [
+                hidden_leaderboard_table_for_search,
+                leaderboard_table,
+                shown_columns,
+                filter_columns_type,
+                filter_columns_precision,
+                filter_columns_size,
+                deleted_models_visibility,
+                search_bar,
+            ],
+            leaderboard_table,
+            queue=True,
        )
        filter_columns_size.change(
-            filter_models,
+            update_table,
            [
                hidden_leaderboard_table_for_search,
                leaderboard_table,
+                shown_columns,
                filter_columns_type,
+                filter_columns_precision,
                filter_columns_size,
                deleted_models_visibility,
+                search_bar,
            ],
            leaderboard_table,
-            queue=False,
+            queue=True,
        )
        deleted_models_visibility.change(
-            filter_models,
+            update_table,
            [
                hidden_leaderboard_table_for_search,
                leaderboard_table,
+                shown_columns,
                filter_columns_type,
+                filter_columns_precision,
                filter_columns_size,
                deleted_models_visibility,
+                search_bar,
            ],
            leaderboard_table,
-            queue=False,
+            queue=True,
        )
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -556,20 +587,6 @@ with demo:
            submission_result,
        )
 
-        with gr.Row():
-            refresh_button = gr.Button("Refresh")
-            refresh_button.click(
-                refresh,
-                inputs=[],
-                outputs=[
-                    leaderboard_table,
-                    finished_eval_table,
-                    running_eval_table,
-                    pending_eval_table,
-                ],
-                api_name='refresh'
-            )
-
        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                citation_button = gr.Textbox(
@@ -589,6 +606,6 @@ with demo:
    )
 
scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=3600)
+scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(concurrency_count=40).launch()
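
Note on the size filter above: filter_models now checks each model's parameter count against the selected pd.Interval buckets individually rather than a single (min, max) span, so non-contiguous selections behave correctly. A minimal, self-contained sketch of that masking logic; the sample values and the selected list below are illustrative, not taken from the leaderboard data:

import pandas as pd

NUMERIC_INTERVALS = {
    "Unknown": pd.Interval(-1, 0, closed="right"),
    "< 1.5B": pd.Interval(0, 1.5, closed="right"),
    "~3B": pd.Interval(1.5, 5, closed="right"),
    "~7B": pd.Interval(6, 11, closed="right"),
}

# Illustrative parameter counts in billions; non-numeric entries become NaN.
params_column = pd.to_numeric(pd.Series(["1.3", "6.7", "n/a", "3.0"]), errors="coerce")

# Buckets picked in the "Model sizes" checkbox group (illustrative selection).
selected = ["< 1.5B", "~7B"]
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in selected]))

# A row passes if its parameter count falls inside any selected interval; NaN matches nothing.
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(mask.tolist())  # [True, True, False, False]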
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+accelerate==0.23.0
 aiofiles==23.1.0
 aiohttp==3.8.4
 aiosignal==1.3.1
@@ -19,8 +20,8 @@ filelock==3.11.0
 fonttools==4.39.3
 frozenlist==1.3.3
 fsspec==2023.4.0
-gradio==3.27.0
-gradio_client==0.1.3
+gradio==3.43.2
+gradio-client==0.5.0
 h11==0.14.0
 httpcore==0.17.0
 httpx==0.24.0
@@ -59,7 +60,7 @@ sniffio==1.3.0
 starlette==0.26.1
 toolz==0.12.0
 tqdm==4.65.0
-transformers==4.32.0
+transformers==4.33.1
 typing_extensions==4.5.0
 tzdata==2023.3
 tzlocal==4.3
src/display_models/get_model_metadata.py CHANGED
@@ -2,11 +2,14 @@ import glob
 import json
 import os
 import re
+import pickle
 from typing import List
 
 import huggingface_hub
 from huggingface_hub import HfApi
 from tqdm import tqdm
+from transformers import AutoModel, AutoConfig
+from accelerate import init_empty_weights
 
 from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
 from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
@@ -16,27 +19,53 @@ api = HfApi(token=os.environ.get("H4_TOKEN", None))
 
 
 def get_model_infos_from_hub(leaderboard_data: List[dict]):
+    # load cache from disk
+    try:
+        with open("model_info_cache.pkl", "rb") as f:
+            model_info_cache = pickle.load(f)
+    except (EOFError, FileNotFoundError):
+        model_info_cache = {}
+    try:
+        with open("model_size_cache.pkl", "rb") as f:
+            model_size_cache = pickle.load(f)
+    except (EOFError, FileNotFoundError):
+        model_size_cache = {}
+
     for model_data in tqdm(leaderboard_data):
        model_name = model_data["model_name_for_query"]
-        try:
-            model_info = api.model_info(model_name)
-        except huggingface_hub.utils._errors.RepositoryNotFoundError:
-            print("Repo not found!", model_name)
-            model_data[AutoEvalColumn.license.name] = None
-            model_data[AutoEvalColumn.likes.name] = None
-            model_data[AutoEvalColumn.params.name] = get_model_size(model_name, None)
-            continue
+
+        if model_name in model_info_cache:
+            model_info = model_info_cache[model_name]
+        else:
+            try:
+                model_info = api.model_info(model_name)
+                model_info_cache[model_name] = model_info
+            except huggingface_hub.utils._errors.RepositoryNotFoundError:
+                print("Repo not found!", model_name)
+                model_data[AutoEvalColumn.license.name] = None
+                model_data[AutoEvalColumn.likes.name] = None
+                if model_name not in model_size_cache:
+                    model_size_cache[model_name] = get_model_size(model_name, None)
+                model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
 
        model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
        model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
-        model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
+        if model_name not in model_size_cache:
+            model_size_cache[model_name] = get_model_size(model_name, model_info)
+        model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
+
+        # save cache to disk in pickle format
+        with open("model_info_cache.pkl", "wb") as f:
+            pickle.dump(model_info_cache, f)
+        with open("model_size_cache.pkl", "wb") as f:
+            pickle.dump(model_size_cache, f)
 
 
 def get_model_license(model_info):
    try:
        return model_info.cardData["license"]
    except Exception:
-        return None
+        return "?"
 
 
 def get_model_likes(model_info):
@@ -52,11 +81,17 @@ def get_model_size(model_name, model_info):
        return round(model_info.safetensors["total"] / 1e9, 3)
    except AttributeError:
        try:
-            size_match = re.search(size_pattern, model_name.lower())
-            size = size_match.group(0)
-            return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return None
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=False)
+            with init_empty_weights():
+                model = AutoModel.from_config(config, trust_remote_code=False)
+            return round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9, 3)
+        except (EnvironmentError, ValueError): # model config not found, likely private
+            try:
+                size_match = re.search(size_pattern, model_name.lower())
+                size = size_match.group(0)
+                return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
+            except AttributeError:
+                return 0
 
 
 def get_model_type(leaderboard_data: List[dict]):
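
Note on get_model_infos_from_hub above: Hub metadata and size estimates are now memoized in pickle files between runs, and model size is estimated by instantiating the architecture with empty (meta-device) weights via accelerate and counting parameters, falling back to parsing the size out of the model name. A minimal sketch of those two patterns under stated assumptions; the cache path and model id here are hypothetical, and AutoModel only covers architectures transformers can build from a config:

import pickle

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModel

CACHE_PATH = "model_size_cache.pkl"  # hypothetical path for this sketch

def load_cache(path: str) -> dict:
    # A missing or empty cache file simply starts a fresh cache.
    try:
        with open(path, "rb") as f:
            return pickle.load(f)
    except (EOFError, FileNotFoundError):
        return {}

def estimate_params_billions(model_name: str) -> float:
    # Build the model skeleton without allocating real weights, then count parameters.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=False)
    with init_empty_weights():
        model = AutoModel.from_config(config, trust_remote_code=False)
    return round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9, 3)

cache = load_cache(CACHE_PATH)
name = "bigscience/bloom-560m"  # example model id, not taken from the commit
if name not in cache:
    cache[name] = estimate_params_billions(name)
with open(CACHE_PATH, "wb") as f:
    pickle.dump(cache, f)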
src/display_models/model_metadata_flags.py CHANGED
@@ -1,15 +1,8 @@
 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
 FLAGGED_MODELS = {
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
 }
 
 # Models which have been requested by orgs to not be submitted on the leaderboard
 DO_NOT_SUBMIT_MODELS = [
-    "Voicelab/trurl-2-13b", # trained on MMLU
 ]
src/display_models/read_results.py CHANGED
@@ -87,11 +87,11 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     if len(model_split) == 1:
         org = None
         model = model_split[0]
-        result_key = f"{model}_{model_sha}_{precision}"
+        result_key = f"{model}_{precision}"
     else:
         org = model_split[0]
         model = model_split[1]
-        result_key = f"{org}_{model}_{model_sha}_{precision}"
+        result_key = f"{org}_{model}_{precision}"
 
     eval_results = []
     for benchmark, metric in zip(BENCHMARKS, METRICS):
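
Note on the result_key change above: with model_sha dropped from the key, results for the same model and precision produced from different commits now share one key (presumably so a re-run replaces the older entry rather than showing up twice), while different precisions remain distinct. Illustrative values only:

org, model, precision = "my-org", "my-model", "float16"  # hypothetical values
result_key = f"{org}_{model}_{precision}"
print(result_key)  # my-org_my-model_float16 -- identical for every model_sha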
src/load_from_hub.py CHANGED
@@ -80,11 +80,8 @@ def get_leaderboard_df(
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-
-    print(df)
     return df
 
 
@@ -125,7 +122,7 @@ def get_evaluation_queue_df(
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
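
Note on get_evaluation_queue_df above: entries with status PENDING_NEW_EVAL previously matched none of the three filters and were dropped from every table; they now appear alongside finished evaluations. A small sketch of the partition, using made-up queue entries:

all_evals = [  # made-up entries for illustration
    {"model": "a", "status": "PENDING"},
    {"model": "b", "status": "RUNNING"},
    {"model": "c", "status": "FINISHED"},
    {"model": "d", "status": "PENDING_NEW_EVAL"},
]

pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

print([e["model"] for e in pending_list])   # ['a']
print([e["model"] for e in running_list])   # ['b']
print([e["model"] for e in finished_list])  # ['c', 'd']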