.gitignore CHANGED
@@ -1,19 +1,15 @@
+ auto_evals/
  venv/
- .venv/
  __pycache__/
  .env
  .ipynb_checkpoints
  *ipynb
  .vscode/
- .DS_Store
- .ruff_cache/
- .python-version
- .profile_app.python
- *pstats
- poetry.lock

+ gpt_4_evals/
+ human_evals/
  eval-queue/
  eval-results/
- dynamic-info/
+ auto_evals/

  src/assets/model_counts.html
README.md CHANGED
@@ -4,21 +4,11 @@ emoji: 🏆
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.9.0
+ sdk_version: 3.43.2
  app_file: app.py
  pinned: true
  license: apache-2.0
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
- fullWidth: true
- startup_duration_timeout: 1h
- space_ci:
-   private: true
-   secrets:
-     - HF_TOKEN
-     - H4_TOKEN
- tags:
-   - leaderboard
- short_description: Track, rank and evaluate open LLMs and chatbots
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,265 +1,291 @@
 
1
  import os
2
- import time
3
- import logging
4
  import gradio as gr
5
  import pandas as pd
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
- from huggingface_hub import snapshot_download
8
- from gradio_space_ci import enable_space_ci
9
 
10
- from src.display.about import (
 
11
  CITATION_BUTTON_LABEL,
12
  CITATION_BUTTON_TEXT,
13
  EVALUATION_QUEUE_TEXT,
14
- FAQ_TEXT,
15
  INTRODUCTION_TEXT,
16
  LLM_BENCHMARKS_TEXT,
17
  TITLE,
18
  )
19
- from src.display.css_html_js import custom_css
20
- from src.display.utils import (
21
- BENCHMARK_COLS,
22
- COLS,
23
- EVAL_COLS,
24
- EVAL_TYPES,
25
- NUMERIC_INTERVALS,
26
- TYPES,
27
  AutoEvalColumn,
28
- ModelType,
29
- Precision,
30
- WeightType,
31
  fields,
 
 
 
32
  )
33
- from src.envs import (
34
- API,
35
- DYNAMIC_INFO_FILE_PATH,
36
- DYNAMIC_INFO_PATH,
37
- DYNAMIC_INFO_REPO,
38
- EVAL_REQUESTS_PATH,
39
- EVAL_RESULTS_PATH,
40
- H4_TOKEN,
41
- IS_PUBLIC,
42
- QUEUE_REPO,
43
- REPO_ID,
44
- RESULTS_REPO,
45
- )
46
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
47
- from src.scripts.update_all_request_files import update_dynamic_files
48
- from src.submission.submit import add_new_eval
49
- from src.tools.collections import update_collections
50
- from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
51
 
 
 
52
 
53
- # Configure logging
54
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
55
 
56
- # Start ephemeral Spaces on PRs (see config in README.md)
57
- enable_space_ci()
 
 
 
 
 
58
 
59
 
60
  def restart_space():
61
- API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
62
-
63
-
64
- def time_diff_wrapper(func):
65
- def wrapper(*args, **kwargs):
66
- start_time = time.time()
67
- result = func(*args, **kwargs)
68
- end_time = time.time()
69
- diff = end_time - start_time
70
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
71
- return result
72
- return wrapper
73
-
74
-
75
- @time_diff_wrapper
76
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
77
- """Download dataset with exponential backoff retries."""
78
- attempt = 0
79
- while attempt < max_attempts:
80
- try:
81
- logging.info(f"Downloading {repo_id} to {local_dir}")
82
- snapshot_download(
83
- repo_id=repo_id,
84
- local_dir=local_dir,
85
- repo_type=repo_type,
86
- tqdm_class=None,
87
- etag_timeout=30,
88
- max_workers=8,
89
- )
90
- logging.info("Download successful")
91
- return
92
- except Exception as e:
93
- wait_time = backoff_factor ** attempt
94
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
95
- time.sleep(wait_time)
96
- attempt += 1
97
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
98
-
99
- def init_space(full_init: bool = True):
100
- """Initializes the application space, loading only necessary data."""
101
- if full_init:
102
- # These downloads only occur on full initialization
103
- try:
104
- download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
105
- download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
106
- download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
107
- except Exception:
108
- restart_space()
109
-
110
- # Always retrieve the leaderboard DataFrame
111
- raw_data, original_df = get_leaderboard_df(
112
- results_path=EVAL_RESULTS_PATH,
113
- requests_path=EVAL_REQUESTS_PATH,
114
- dynamic_path=DYNAMIC_INFO_FILE_PATH,
115
- cols=COLS,
116
- benchmark_cols=BENCHMARK_COLS,
117
- )
118
 
119
- if full_init:
120
- # Collection update only happens on full initialization
121
- update_collections(original_df)
122
 
123
- leaderboard_df = original_df.copy()
124
-
125
- # Evaluation queue DataFrame retrieval is independent of initialization detail level
126
- eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
127
 
128
- return leaderboard_df, raw_data, original_df, eval_queue_dfs
 
 
 
 
129
 
130
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
131
- # This controls whether a full initialization should be performed.
132
- do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
133
 
134
- # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
135
- # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
136
- leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
137
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
138
 
 
 
 
 
 
 
 
 
 
139
 
140
- # Data processing for plots now only on demand in the respective Gradio tab
141
- def load_and_create_plots():
142
- plot_df = create_plot_df(create_scores_df(raw_data))
143
- return plot_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
 
146
  # Searching and filtering
147
  def update_table(
148
  hidden_df: pd.DataFrame,
 
149
  columns: list,
150
  type_query: list,
151
  precision_query: str,
152
  size_query: list,
153
- hide_models: list,
154
  query: str,
155
  ):
156
- filtered_df = filter_models(
157
- df=hidden_df,
158
- type_query=type_query,
159
- size_query=size_query,
160
- precision_query=precision_query,
161
- hide_models=hide_models,
162
- )
163
- filtered_df = filter_queries(query, filtered_df)
 
 
 
164
  df = select_columns(filtered_df, columns)
165
  return df
166
 
167
 
168
- def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
169
- query = request.query_params.get("query") or ""
170
- return (
171
- query,
172
- query,
173
- ) # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
174
-
175
 
176
- def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
177
- return df[(df[AutoEvalColumn.fullname.name].str.contains(query, case=False, na=False))]
178
-
179
- def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
180
- return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
181
 
182
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
183
- always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
184
- dummy_col = [AutoEvalColumn.fullname.name]
185
- filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
186
- return filtered_df
187
-
188
- def filter_queries(query: str, df: pd.DataFrame):
189
- tmp_result_df = []
190
-
191
- # Empty query return the same df
192
- if query == "":
193
- return df
194
-
195
- # all_queries = [q.strip() for q in query.split(";")]
196
- # license_queries = []
197
- all_queries = [q.strip() for q in query.split(";") if q.strip() != ""]
198
- model_queries = [q for q in all_queries if not q.startswith("licence")]
199
- license_queries_raw = [q for q in all_queries if q.startswith("license")]
200
- license_queries = [
201
- q.replace("license:", "").strip() for q in license_queries_raw if q.replace("license:", "").strip() != ""
202
  ]
 
 
 
 
 
203
 
204
- # Handling model name search
205
- for query in model_queries:
206
- tmp_df = search_model(df, query)
207
- if len(tmp_df) > 0:
208
- tmp_result_df.append(tmp_df)
209
-
210
- if not tmp_result_df and not license_queries:
211
- # Nothing is found, no license_queries -> return empty df
212
- return pd.DataFrame(columns=df.columns)
213
-
214
- if tmp_result_df:
215
- df = pd.concat(tmp_result_df)
216
- df = df.drop_duplicates(
217
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
218
- )
219
-
220
- if not license_queries:
221
- return df
222
-
223
- # Handling license search
224
- tmp_result_df = []
225
- for query in license_queries:
226
- tmp_df = search_license(df, query)
227
- if len(tmp_df) > 0:
228
- tmp_result_df.append(tmp_df)
229
 
230
- if not tmp_result_df:
231
- # Nothing is found, return empty df
232
- return pd.DataFrame(columns=df.columns)
233
-
234
- df = pd.concat(tmp_result_df)
235
- df = df.drop_duplicates(
236
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
237
- )
238
-
239
- return df
240
 
241
 
242
  def filter_models(
243
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, hide_models: list
244
  ) -> pd.DataFrame:
245
  # Show all models
246
- if "Private or deleted" in hide_models:
247
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
248
- else:
249
  filtered_df = df
250
-
251
- if "Contains a merge/moerge" in hide_models:
252
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
253
-
254
- if "MoE" in hide_models:
255
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
256
-
257
- if "Flagged" in hide_models:
258
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
259
 
260
  type_emoji = [t[0] for t in type_query]
261
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
262
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
263
 
264
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
265
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -269,14 +295,6 @@ def filter_models(
269
  return filtered_df
270
 
271
 
272
- leaderboard_df = filter_models(
273
- df=leaderboard_df,
274
- type_query=[t.to_str(" : ") for t in ModelType],
275
- size_query=list(NUMERIC_INTERVALS.keys()),
276
- precision_query=[i.value.name for i in Precision],
277
- hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs
278
- )
279
-
280
  demo = gr.Blocks(css=custom_css)
281
  with demo:
282
  gr.HTML(TITLE)
@@ -288,65 +306,92 @@ with demo:
288
  with gr.Column():
289
  with gr.Row():
290
  search_bar = gr.Textbox(
291
- placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
292
  show_label=False,
293
  elem_id="search-bar",
294
  )
295
  with gr.Row():
296
  shown_columns = gr.CheckboxGroup(
297
  choices=[
298
- c.name
299
- for c in fields(AutoEvalColumn)
300
- if not c.hidden and not c.never_hidden and not c.dummy
 
 
 
 
 
 
301
  ],
302
  value=[
303
- c.name
304
- for c in fields(AutoEvalColumn)
305
- if c.displayed_by_default and not c.hidden and not c.never_hidden
 
 
 
 
 
 
306
  ],
307
  label="Select columns to show",
308
  elem_id="column-select",
309
  interactive=True,
310
  )
311
  with gr.Row():
312
- hide_models = gr.CheckboxGroup(
313
- label="Hide models",
314
- choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
315
- value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
316
- interactive=True,
317
  )
318
  with gr.Column(min_width=320):
319
- # with gr.Box(elem_id="box-filter"):
320
- filter_columns_type = gr.CheckboxGroup(
321
- label="Model types",
322
- choices=[t.to_str() for t in ModelType],
323
- value=[t.to_str() for t in ModelType],
324
- interactive=True,
325
- elem_id="filter-columns-type",
326
- )
327
- filter_columns_precision = gr.CheckboxGroup(
328
- label="Precision",
329
- choices=[i.value.name for i in Precision],
330
- value=[i.value.name for i in Precision],
331
- interactive=True,
332
- elem_id="filter-columns-precision",
333
- )
334
- filter_columns_size = gr.CheckboxGroup(
335
- label="Model sizes (in billions of parameters)",
336
- choices=list(NUMERIC_INTERVALS.keys()),
337
- value=list(NUMERIC_INTERVALS.keys()),
338
- interactive=True,
339
- elem_id="filter-columns-size",
340
- )
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  leaderboard_table = gr.components.Dataframe(
343
  value=leaderboard_df[
344
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
345
  + shown_columns.value
346
- + [AutoEvalColumn.fullname.name]
347
  ],
348
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
 
 
 
 
 
349
  datatype=TYPES,
 
350
  elem_id="leaderboard-table",
351
  interactive=False,
352
  visible=True,
@@ -354,121 +399,175 @@ with demo:
354
 
355
  # Dummy leaderboard for handling the case when the user uses backspace key
356
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
357
- value=original_df[COLS],
358
  headers=COLS,
359
  datatype=TYPES,
 
360
  visible=False,
361
  )
362
  search_bar.submit(
363
  update_table,
364
  [
365
  hidden_leaderboard_table_for_search,
 
366
  shown_columns,
367
  filter_columns_type,
368
  filter_columns_precision,
369
  filter_columns_size,
370
- hide_models,
371
  search_bar,
372
  ],
373
  leaderboard_table,
374
  )
375
-
376
- # Define a hidden component that will trigger a reload only if a query parameter has been set
377
- hidden_search_bar = gr.Textbox(value="", visible=False)
378
- hidden_search_bar.change(
379
  update_table,
380
  [
381
  hidden_leaderboard_table_for_search,
 
382
  shown_columns,
383
  filter_columns_type,
384
  filter_columns_precision,
385
  filter_columns_size,
386
- hide_models,
387
  search_bar,
388
  ],
389
  leaderboard_table,
 
390
  )
391
- # Check query parameter once at startup and update search bar + hidden component
392
- demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
393
-
394
- for selector in [
395
- shown_columns,
396
- filter_columns_type,
397
- filter_columns_precision,
398
- filter_columns_size,
399
- hide_models,
400
- ]:
401
- selector.change(
402
- update_table,
403
- [
404
- hidden_leaderboard_table_for_search,
405
- shown_columns,
406
- filter_columns_type,
407
- filter_columns_precision,
408
- filter_columns_size,
409
- hide_models,
410
- search_bar,
411
- ],
412
  leaderboard_table,
413
- queue=True,
414
- )
415
-
416
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
417
- with gr.Row():
418
- with gr.Column():
419
- plot_df = load_and_create_plots()
420
- chart = create_metric_plot_obj(
421
- plot_df,
422
- [AutoEvalColumn.average.name],
423
- title="Average of Top Scores and Human Baseline Over Time (from last update)",
424
- )
425
- gr.Plot(value=chart, min_width=500)
426
- with gr.Column():
427
- plot_df = load_and_create_plots()
428
- chart = create_metric_plot_obj(
429
- plot_df,
430
- BENCHMARK_COLS,
431
- title="Top Scores and Human Baseline Over Time (from last update)",
432
- )
433
- gr.Plot(value=chart, min_width=500)
434
-
435
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
437
 
438
- with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
439
- gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
440
-
441
- with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
442
  with gr.Column():
443
  with gr.Row():
444
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
446
  with gr.Row():
447
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
448
 
449
  with gr.Row():
450
  with gr.Column():
451
  model_name_textbox = gr.Textbox(label="Model name")
452
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
453
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
454
  model_type = gr.Dropdown(
455
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
 
 
 
 
 
456
  label="Model type",
457
  multiselect=False,
458
- value=ModelType.FT.to_str(" : "),
459
  interactive=True,
460
  )
461
 
462
  with gr.Column():
463
  precision = gr.Dropdown(
464
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
465
  label="Precision",
466
  multiselect=False,
467
  value="float16",
468
  interactive=True,
469
  )
470
  weight_type = gr.Dropdown(
471
- choices=[i.value.name for i in WeightType],
472
  label="Weights type",
473
  multiselect=False,
474
  value="Original",
@@ -476,42 +575,6 @@ with demo:
476
  )
477
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
478
 
479
- with gr.Column():
480
- with gr.Accordion(
481
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
482
- open=False,
483
- ):
484
- with gr.Row():
485
- finished_eval_table = gr.components.Dataframe(
486
- value=finished_eval_queue_df,
487
- headers=EVAL_COLS,
488
- datatype=EVAL_TYPES,
489
- row_count=5,
490
- )
491
- with gr.Accordion(
492
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
493
- open=False,
494
- ):
495
- with gr.Row():
496
- running_eval_table = gr.components.Dataframe(
497
- value=running_eval_queue_df,
498
- headers=EVAL_COLS,
499
- datatype=EVAL_TYPES,
500
- row_count=5,
501
- )
502
-
503
- with gr.Accordion(
504
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
505
- open=False,
506
- ):
507
- with gr.Row():
508
- pending_eval_table = gr.components.Dataframe(
509
- value=pending_eval_queue_df,
510
- headers=EVAL_COLS,
511
- datatype=EVAL_TYPES,
512
- row_count=5,
513
- )
514
-
515
  submit_button = gr.Button("Submit Eval")
516
  submission_result = gr.Markdown()
517
  submit_button.click(
@@ -535,12 +598,17 @@ with demo:
535
  label=CITATION_BUTTON_LABEL,
536
  lines=20,
537
  elem_id="citation-button",
538
- show_copy_button=True,
539
- )
 
 
 
 
 
 
 
540
 
541
  scheduler = BackgroundScheduler()
542
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
543
- scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
544
  scheduler.start()
545
-
546
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import json
2
  import os
3
+ from datetime import datetime, timezone
4
+
5
  import gradio as gr
6
  import pandas as pd
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
+ from huggingface_hub import HfApi
 
9
 
10
+ from src.assets.css_html_js import custom_css, get_window_url_params
11
+ from src.assets.text_content import (
12
  CITATION_BUTTON_LABEL,
13
  CITATION_BUTTON_TEXT,
14
  EVALUATION_QUEUE_TEXT,
 
15
  INTRODUCTION_TEXT,
16
  LLM_BENCHMARKS_TEXT,
17
  TITLE,
18
  )
19
+ from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
20
+ from src.display_models.modelcard_filter import check_model_card
21
+ from src.display_models.utils import (
 
 
 
 
 
22
  AutoEvalColumn,
23
+ EvalQueueColumn,
 
 
24
  fields,
25
+ styled_error,
26
+ styled_message,
27
+ styled_warning,
28
  )
29
+ from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
30
+ from src.rate_limiting import user_submission_permission
31
+
32
+ pd.set_option("display.precision", 1)
33
+
34
+ # clone / pull the lmeh eval data
35
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
36
+
37
+ QUEUE_REPO = "open-llm-leaderboard/requests"
38
+ RESULTS_REPO = "open-llm-leaderboard/results"
 
 
 
 
 
 
 
 
39
 
40
+ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
41
+ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
42
 
43
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
44
 
45
+ EVAL_REQUESTS_PATH = "eval-queue"
46
+ EVAL_RESULTS_PATH = "eval-results"
47
+
48
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
49
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
50
+
51
+ api = HfApi(token=H4_TOKEN)
52
 
53
 
54
  def restart_space():
55
+ api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)
56
 
 
 
 
57
 
58
+ # Rate limit variables
59
+ RATE_LIMIT_PERIOD = 7
60
+ RATE_LIMIT_QUOTA = 5
 
61
 
62
+ # Column selection
63
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
64
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
65
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
66
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
67
 
68
+ if not IS_PUBLIC:
69
+ COLS.insert(2, AutoEvalColumn.precision.name)
70
+ TYPES.insert(2, AutoEvalColumn.precision.type)
71
 
72
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
73
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 
74
 
75
+ BENCHMARK_COLS = [
76
+ c.name
77
+ for c in [
78
+ AutoEvalColumn.arc,
79
+ AutoEvalColumn.hellaswag,
80
+ AutoEvalColumn.mmlu,
81
+ AutoEvalColumn.truthfulqa,
82
+ ]
83
+ ]
84
 
85
+ ## LOAD INFO FROM HUB
86
+ eval_queue, requested_models, eval_results, users_to_submission_dates = load_all_info_from_hub(
87
+ QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
88
+ )
89
+
90
+ if not IS_PUBLIC:
91
+ (eval_queue_private, requested_models_private, eval_results_private, _) = load_all_info_from_hub(
92
+ PRIVATE_QUEUE_REPO,
93
+ PRIVATE_RESULTS_REPO,
94
+ EVAL_REQUESTS_PATH_PRIVATE,
95
+ EVAL_RESULTS_PATH_PRIVATE,
96
+ )
97
+ else:
98
+ eval_queue_private, eval_results_private = None, None
99
+
100
+ original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
101
+ models = original_df["model_name_for_query"].tolist()  # needed for model backlinks to the leaderboard
102
+
103
+ to_be_dumped = f"models = {repr(models)}\n"
104
+
105
+ leaderboard_df = original_df.copy()
106
+ (
107
+ finished_eval_queue_df,
108
+ running_eval_queue_df,
109
+ pending_eval_queue_df,
110
+ ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
111
+
112
+
113
+ ## INTERACTION FUNCTIONS
114
+ def add_new_eval(
115
+ model: str,
116
+ base_model: str,
117
+ revision: str,
118
+ precision: str,
119
+ private: bool,
120
+ weight_type: str,
121
+ model_type: str,
122
+ ):
123
+ precision = precision.split(" ")[0]
124
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
125
+
126
+ if model_type is None or model_type == "":
127
+ return styled_error("Please select a model type.")
128
+
129
+ # Is the user rate limited?
130
+ num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
131
+ if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
132
+ error_msg = f"Organisation or user `{model.split('/')[0]}`"
133
+ error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
134
+ error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
135
+ error_msg += (
136
+ "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
137
+ )
138
+ return styled_error(error_msg)
139
+
140
+ # Did the model authors forbid its submission to the leaderboard?
141
+ if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
142
+ return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
143
+
144
+ # Does the model actually exist?
145
+ if revision == "":
146
+ revision = "main"
147
+
148
+ if weight_type in ["Delta", "Adapter"]:
149
+ base_model_on_hub, error = is_model_on_hub(base_model, revision)
150
+ if not base_model_on_hub:
151
+ return styled_error(f'Base model "{base_model}" {error}')
152
+
153
+ if not weight_type == "Adapter":
154
+ model_on_hub, error = is_model_on_hub(model, revision)
155
+ if not model_on_hub:
156
+ return styled_error(f'Model "{model}" {error}')
157
+
158
+ # Were the model card and license filled?
159
+ modelcard_OK, error_msg = check_model_card(model)
160
+ if not modelcard_OK:
161
+ return styled_error(error_msg)
162
+
163
+ # Seems good, creating the eval
164
+ print("Adding new eval")
165
+
166
+ eval_entry = {
167
+ "model": model,
168
+ "base_model": base_model,
169
+ "revision": revision,
170
+ "private": private,
171
+ "precision": precision,
172
+ "weight_type": weight_type,
173
+ "status": "PENDING",
174
+ "submitted_time": current_time,
175
+ "model_type": model_type,
176
+ }
177
+
178
+ user_name = ""
179
+ model_path = model
180
+ if "/" in model:
181
+ user_name = model.split("/")[0]
182
+ model_path = model.split("/")[1]
183
+
184
+ print("Creating eval file")
185
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
186
+ os.makedirs(OUT_DIR, exist_ok=True)
187
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
188
+
189
+ # Check for duplicate submission
190
+ if f"{model}_{revision}_{precision}" in requested_models:
191
+ return styled_warning("This model has been already submitted.")
192
+
193
+ with open(out_path, "w") as f:
194
+ f.write(json.dumps(eval_entry))
195
+
196
+ print("Uploading eval file")
197
+ api.upload_file(
198
+ path_or_fileobj=out_path,
199
+ path_in_repo=out_path.split("eval-queue/")[1],
200
+ repo_id=QUEUE_REPO,
201
+ repo_type="dataset",
202
+ commit_message=f"Add {model} to eval queue",
203
+ )
204
+
205
+ # Remove the local file
206
+ os.remove(out_path)
207
+
208
+ return styled_message(
209
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
210
+ )
211
+
212
+
213
+ # Basics
214
+ def change_tab(query_param: str):
215
+ query_param = query_param.replace("'", '"')
216
+ query_param = json.loads(query_param)
217
+
218
+ if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
219
+ return gr.Tabs.update(selected=1)
220
+ else:
221
+ return gr.Tabs.update(selected=0)
222
 
223
 
224
  # Searching and filtering
225
  def update_table(
226
  hidden_df: pd.DataFrame,
227
+ current_columns_df: pd.DataFrame,
228
  columns: list,
229
  type_query: list,
230
  precision_query: str,
231
  size_query: list,
232
+ show_deleted: bool,
233
  query: str,
234
  ):
235
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
236
+ final_df = []
237
+ if query != "":
238
+ queries = query.split(";")
239
+ for _q in queries:
240
+ if _q != "":
241
+ temp_filtered_df = search_table(filtered_df, _q)
242
+ if len(temp_filtered_df) > 0:
243
+ final_df.append(temp_filtered_df)
244
+ if len(final_df) > 0:
245
+ filtered_df = pd.concat(final_df).drop_duplicates()
246
  df = select_columns(filtered_df, columns)
247
  return df
248
 
249
 
250
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
251
+ return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 
 
 
 
 
252
 
 
 
 
 
 
253
 
254
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
255
+ always_here_cols = [
256
+ AutoEvalColumn.model_type_symbol.name,
257
+ AutoEvalColumn.model.name,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  ]
259
+ # We use COLS to maintain sorting
260
+ filtered_df = df[
261
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
262
+ ]
263
+ return filtered_df
 
 
265
 
266
+ NUMERIC_INTERVALS = {
267
+ "Unknown": pd.Interval(-1, 0, closed="right"),
268
+ "< 1.5B": pd.Interval(0, 1.5, closed="right"),
269
+ "~3B": pd.Interval(1.5, 5, closed="right"),
270
+ "~7B": pd.Interval(6, 11, closed="right"),
271
+ "~13B": pd.Interval(12, 15, closed="right"),
272
+ "~35B": pd.Interval(16, 55, closed="right"),
273
+ "60B+": pd.Interval(55, 10000, closed="right"),
274
+ }
 
275
 
276
 
277
  def filter_models(
278
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
279
  ) -> pd.DataFrame:
280
  # Show all models
281
+ if show_deleted:
 
 
282
  filtered_df = df
283
+ else: # Show only still on the hub models
284
+ filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
 
 
 
 
 
 
285
 
286
  type_emoji = [t[0] for t in type_query]
287
+ filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
288
+ filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
289
 
290
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
291
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
 
295
  return filtered_df
296
 
297
 
 
 
 
 
 
 
 
 
298
  demo = gr.Blocks(css=custom_css)
299
  with demo:
300
  gr.HTML(TITLE)
 
306
  with gr.Column():
307
  with gr.Row():
308
  search_bar = gr.Textbox(
309
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
310
  show_label=False,
311
  elem_id="search-bar",
312
  )
313
  with gr.Row():
314
  shown_columns = gr.CheckboxGroup(
315
  choices=[
316
+ c
317
+ for c in COLS
318
+ if c
319
+ not in [
320
+ AutoEvalColumn.dummy.name,
321
+ AutoEvalColumn.model.name,
322
+ AutoEvalColumn.model_type_symbol.name,
323
+ AutoEvalColumn.still_on_hub.name,
324
+ ]
325
  ],
326
  value=[
327
+ c
328
+ for c in COLS_LITE
329
+ if c
330
+ not in [
331
+ AutoEvalColumn.dummy.name,
332
+ AutoEvalColumn.model.name,
333
+ AutoEvalColumn.model_type_symbol.name,
334
+ AutoEvalColumn.still_on_hub.name,
335
+ ]
336
  ],
337
  label="Select columns to show",
338
  elem_id="column-select",
339
  interactive=True,
340
  )
341
  with gr.Row():
342
+ deleted_models_visibility = gr.Checkbox(
343
+ value=True, label="Show gated/private/deleted models", interactive=True
 
 
 
344
  )
345
  with gr.Column(min_width=320):
346
+ with gr.Box(elem_id="box-filter"):
347
+ filter_columns_type = gr.CheckboxGroup(
348
+ label="Model types",
349
+ choices=[
350
+ ModelType.PT.to_str(),
351
+ ModelType.FT.to_str(),
352
+ ModelType.IFT.to_str(),
353
+ ModelType.RL.to_str(),
354
+ ModelType.Unknown.to_str(),
355
+ ],
356
+ value=[
357
+ ModelType.PT.to_str(),
358
+ ModelType.FT.to_str(),
359
+ ModelType.IFT.to_str(),
360
+ ModelType.RL.to_str(),
361
+ ModelType.Unknown.to_str(),
362
+ ],
363
+ interactive=True,
364
+ elem_id="filter-columns-type",
365
+ )
366
+ filter_columns_precision = gr.CheckboxGroup(
367
+ label="Precision",
368
+ choices=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
369
+ value=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
370
+ interactive=True,
371
+ elem_id="filter-columns-precision",
372
+ )
373
+ filter_columns_size = gr.CheckboxGroup(
374
+ label="Model sizes",
375
+ choices=list(NUMERIC_INTERVALS.keys()),
376
+ value=list(NUMERIC_INTERVALS.keys()),
377
+ interactive=True,
378
+ elem_id="filter-columns-size",
379
+ )
380
 
381
  leaderboard_table = gr.components.Dataframe(
382
  value=leaderboard_df[
383
+ [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
384
  + shown_columns.value
385
+ + [AutoEvalColumn.dummy.name]
386
  ],
387
+ headers=[
388
+ AutoEvalColumn.model_type_symbol.name,
389
+ AutoEvalColumn.model.name,
390
+ ]
391
+ + shown_columns.value
392
+ + [AutoEvalColumn.dummy.name],
393
  datatype=TYPES,
394
+ max_rows=None,
395
  elem_id="leaderboard-table",
396
  interactive=False,
397
  visible=True,
 
399
 
400
  # Dummy leaderboard for handling the case when the user uses backspace key
401
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
402
+ value=original_df,
403
  headers=COLS,
404
  datatype=TYPES,
405
+ max_rows=None,
406
  visible=False,
407
  )
408
  search_bar.submit(
409
  update_table,
410
  [
411
  hidden_leaderboard_table_for_search,
412
+ leaderboard_table,
413
  shown_columns,
414
  filter_columns_type,
415
  filter_columns_precision,
416
  filter_columns_size,
417
+ deleted_models_visibility,
418
  search_bar,
419
  ],
420
  leaderboard_table,
421
  )
422
+ shown_columns.change(
 
 
 
423
  update_table,
424
  [
425
  hidden_leaderboard_table_for_search,
426
+ leaderboard_table,
427
  shown_columns,
428
  filter_columns_type,
429
  filter_columns_precision,
430
  filter_columns_size,
431
+ deleted_models_visibility,
432
  search_bar,
433
  ],
434
  leaderboard_table,
435
+ queue=True,
436
  )
437
+ filter_columns_type.change(
438
+ update_table,
439
+ [
440
+ hidden_leaderboard_table_for_search,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  leaderboard_table,
442
+ shown_columns,
443
+ filter_columns_type,
444
+ filter_columns_precision,
445
+ filter_columns_size,
446
+ deleted_models_visibility,
447
+ search_bar,
448
+ ],
449
+ leaderboard_table,
450
+ queue=True,
451
+ )
452
+ filter_columns_precision.change(
453
+ update_table,
454
+ [
455
+ hidden_leaderboard_table_for_search,
456
+ leaderboard_table,
457
+ shown_columns,
458
+ filter_columns_type,
459
+ filter_columns_precision,
460
+ filter_columns_size,
461
+ deleted_models_visibility,
462
+ search_bar,
463
+ ],
464
+ leaderboard_table,
465
+ queue=True,
466
+ )
467
+ filter_columns_size.change(
468
+ update_table,
469
+ [
470
+ hidden_leaderboard_table_for_search,
471
+ leaderboard_table,
472
+ shown_columns,
473
+ filter_columns_type,
474
+ filter_columns_precision,
475
+ filter_columns_size,
476
+ deleted_models_visibility,
477
+ search_bar,
478
+ ],
479
+ leaderboard_table,
480
+ queue=True,
481
+ )
482
+ deleted_models_visibility.change(
483
+ update_table,
484
+ [
485
+ hidden_leaderboard_table_for_search,
486
+ leaderboard_table,
487
+ shown_columns,
488
+ filter_columns_type,
489
+ filter_columns_precision,
490
+ filter_columns_size,
491
+ deleted_models_visibility,
492
+ search_bar,
493
+ ],
494
+ leaderboard_table,
495
+ queue=True,
496
+ )
497
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
498
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
499
 
500
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
 
 
 
501
  with gr.Column():
502
  with gr.Row():
503
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
504
 
505
+ with gr.Column():
506
+ with gr.Accordion(
507
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
508
+ open=False,
509
+ ):
510
+ with gr.Row():
511
+ finished_eval_table = gr.components.Dataframe(
512
+ value=finished_eval_queue_df,
513
+ headers=EVAL_COLS,
514
+ datatype=EVAL_TYPES,
515
+ max_rows=5,
516
+ )
517
+ with gr.Accordion(
518
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
519
+ open=False,
520
+ ):
521
+ with gr.Row():
522
+ running_eval_table = gr.components.Dataframe(
523
+ value=running_eval_queue_df,
524
+ headers=EVAL_COLS,
525
+ datatype=EVAL_TYPES,
526
+ max_rows=5,
527
+ )
528
+
529
+ with gr.Accordion(
530
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
531
+ open=False,
532
+ ):
533
+ with gr.Row():
534
+ pending_eval_table = gr.components.Dataframe(
535
+ value=pending_eval_queue_df,
536
+ headers=EVAL_COLS,
537
+ datatype=EVAL_TYPES,
538
+ max_rows=5,
539
+ )
540
  with gr.Row():
541
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
542
 
543
  with gr.Row():
544
  with gr.Column():
545
  model_name_textbox = gr.Textbox(label="Model name")
546
+ revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
547
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
548
  model_type = gr.Dropdown(
549
+ choices=[
550
+ ModelType.PT.to_str(" : "),
551
+ ModelType.FT.to_str(" : "),
552
+ ModelType.IFT.to_str(" : "),
553
+ ModelType.RL.to_str(" : "),
554
+ ],
555
  label="Model type",
556
  multiselect=False,
557
+ value=None,
558
  interactive=True,
559
  )
560
 
561
  with gr.Column():
562
  precision = gr.Dropdown(
563
+ choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ"],
564
  label="Precision",
565
  multiselect=False,
566
  value="float16",
567
  interactive=True,
568
  )
569
  weight_type = gr.Dropdown(
570
+ choices=["Original", "Delta", "Adapter"],
571
  label="Weights type",
572
  multiselect=False,
573
  value="Original",
 
575
  )
576
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
 
578
  submit_button = gr.Button("Submit Eval")
579
  submission_result = gr.Markdown()
580
  submit_button.click(
 
598
  label=CITATION_BUTTON_LABEL,
599
  lines=20,
600
  elem_id="citation-button",
601
+ ).style(show_copy_button=True)
602
+
603
+ dummy = gr.Textbox(visible=False)
604
+ demo.load(
605
+ change_tab,
606
+ dummy,
607
+ tabs,
608
+ _js=get_window_url_params,
609
+ )
610
 
611
  scheduler = BackgroundScheduler()
612
+ scheduler.add_job(restart_space, "interval", seconds=1800)
 
613
  scheduler.start()
614
+ demo.queue(concurrency_count=40).launch()
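
Note on the rate-limiting call above: `user_submission_permission` is imported from `src/rate_limiting.py`, which is not part of this diff, so only its call site is visible. A minimal sketch of a helper consistent with that call site (the name and signature come from the code above; the body and the timestamp format are assumptions) might look like:

```python
from datetime import datetime, timedelta, timezone


def user_submission_permission(submission_name: str, users_to_submission_dates: dict, rate_limit_period: int) -> int:
    """Return how many models this org/user submitted in the last `rate_limit_period` days (sketch)."""
    org_or_user = submission_name.split("/")[0] if "/" in submission_name else submission_name
    submission_dates = users_to_submission_dates.get(org_or_user, [])

    cutoff = datetime.now(timezone.utc) - timedelta(days=rate_limit_period)
    # Timestamps are assumed to use the same format as `current_time` in add_new_eval above.
    recent = [
        d for d in submission_dates
        if datetime.strptime(d, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc) > cutoff
    ]
    return len(recent)
```

In `add_new_eval`, the returned count is then compared against `RATE_LIMIT_QUOTA` to decide whether the submission is rejected.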
 
model_info_cache.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fcaa2a3e1ac6a5559471547af5de4e3ccd49673ad5525890726e65cd90cfe62
+ size 3620752
model_size_cache.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75d1f64589459eb64e3a50987bf05ed3656248102d1fe2f6c98a008020945840
+ size 74321
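
Both `.pkl` files are committed as Git LFS pointers; the pickled caches they point to are presumably read back at startup by the display code, which is not part of this diff. A hypothetical loader, with every name other than the two file names assumed:

```python
import os
import pickle


def load_cache(path: str) -> dict:
    """Hypothetical helper: return the pickled cache if the file exists, else an empty dict."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    return {}


model_info_cache = load_cache("model_info_cache.pkl")
model_size_cache = load_cache("model_size_cache.pkl")
```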
src/tools/model_backlinks.py → models_backlinks.py RENAMED
File without changes
pyproject.toml CHANGED
@@ -1,15 +1,9 @@
  [tool.ruff]
- line-length = 120
- target-version = "py312"
- include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
- ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004","D107","FA102"]
- fixable=["ALL"]
- select=["ALL"]
-
- [tool.ruff.lint]
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
  select = ["E", "F"]
- fixable = ["ALL"]
  ignore = ["E501"] # line too long (black is taking care of this)
+ line-length = 119
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

  [tool.isort]
  profile = "black"
@@ -17,37 +11,3 @@ line_length = 119

  [tool.black]
  line-length = 119
-
- [tool.poetry]
- package-mode = false
- name = "open-llm-leaderboard"
- version = "0.1.0"
- description = ""
- authors = []
- readme = "README.md"
-
- [tool.poetry.dependencies]
- python = "3.12.1"
- apscheduler = "3.10.1"
- black = "23.11.0"
- click = "8.1.3"
- datasets = "2.14.5"
- huggingface-hub = ">=0.18.0"
- matplotlib = "3.8.4"
- numpy = "1.26.0"
- pandas = "2.2.2"
- plotly = "5.14.1"
- python-dateutil = "2.8.2"
- requests = "2.28.2"
- sentencepiece = "^0.2.0"
- tqdm = "4.65.0"
- transformers = "4.40.0"
- tokenizers = ">=0.15.0"
- gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
- gradio = "4.9.0"
- isort = "^5.13.2"
- ruff = "^0.3.5"
-
- [build-system]
- requires = ["poetry-core"]
- build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -1,16 +1,71 @@
+ accelerate==0.23.0
+ aiofiles==23.1.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==4.2.2
+ anyio==3.6.2
  APScheduler==3.10.1
- black==23.11.0
+ async-timeout==4.0.2
+ attrs==23.1.0
+ certifi==2022.12.7
+ charset-normalizer==3.1.0
  click==8.1.3
- datasets==2.14.5
- huggingface-hub>=0.18.0
- matplotlib==3.8.4
- numpy==1.26.0
- pandas==2.2.2
+ contourpy==1.0.7
+ cycler==0.11.0
+ datasets==2.12.0
+ entrypoints==0.4
+ fastapi==0.95.1
+ ffmpy==0.3.0
+ filelock==3.11.0
+ fonttools==4.39.3
+ frozenlist==1.3.3
+ fsspec==2023.4.0
+ gradio==3.43.2
+ gradio-client==0.5.0
+ h11==0.14.0
+ httpcore==0.17.0
+ httpx==0.24.0
+ huggingface-hub==0.16.4
+ idna==3.4
+ Jinja2==3.1.2
+ jsonschema==4.17.3
+ kiwisolver==1.4.4
+ linkify-it-py==2.0.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ multidict==6.0.4
+ numpy==1.24.2
+ orjson==3.8.10
+ packaging==23.1
+ pandas==2.0.0
+ Pillow==9.5.0
  plotly==5.14.1
+ pyarrow==11.0.0
+ pydantic==1.10.7
+ pydub==0.25.1
+ pyparsing==3.0.9
+ pyrsistent==0.19.3
  python-dateutil==2.8.2
+ python-multipart==0.0.6
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
  requests==2.28.2
- sentencepiece
+ semantic-version==2.10.0
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.26.1
+ toolz==0.12.0
  tqdm==4.65.0
- transformers==4.40.0
- tokenizers>=0.15.0
- gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
+ transformers@git+https://github.com/huggingface/transformers
+ typing_extensions==4.5.0
+ tzdata==2023.3
+ tzlocal==4.3
+ uc-micro-py==1.0.1
+ urllib3==1.26.15
+ uvicorn==0.21.1
+ websockets==11.0.1
+ yarl==1.8.2
src/{display → assets}/css_html_js.py RENAMED
@@ -1,25 +1,5 @@
  custom_css = """

- /* Hides the final AutoEvalColumn */
- #llm-benchmark-tab-table table td:last-child,
- #llm-benchmark-tab-table table th:last-child {
-     display: none;
- }
-
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
- table td:first-child,
- table th:first-child {
-     max-width: 400px;
-     overflow: auto;
-     white-space: nowrap;
- }
-
- /* Full width space */
- .gradio-container {
-     max-width: 95%!important;
- }
-
- /* Text style and margins */
  .markdown-text {
      font-size: 16px !important;
  }
@@ -41,20 +21,53 @@ table th:first-child {
      transform: scale(1.3);
  }

+ #leaderboard-table {
+     margin-top: 15px
+ }
+
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+
  #search-bar-table-box > div:first-child {
      background: none;
      border: none;
  }
-
+
  #search-bar {
      padding: 0px;
  }

+ /* Hides the final AutoEvalColumn */
+ #llm-benchmark-tab-table table td:last-child,
+ #llm-benchmark-tab-table table th:last-child {
+     display: none;
+ }
+
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ table td:first-child,
+ table th:first-child {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+
  .tab-buttons button {
      font-size: 20px;
  }

- /* Filters style */
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+
+ #scale-logo .download {
+     display: none;
+ }
  #filter_type{
      border: 0;
      padding-left: 0;
@@ -95,4 +108,4 @@ get_window_url_params = """
      url_params = Object.fromEntries(params);
      return url_params;
  }
- """
+ """
src/assets/hardcoded_evals.py ADDED
@@ -0,0 +1,40 @@
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
+
+ gpt4_values = {
+     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
+     AutoEvalColumn.revision.name: "tech report",
+     AutoEvalColumn.precision.name: None,
+     AutoEvalColumn.average.name: 84.3,
+     AutoEvalColumn.arc.name: 96.3,
+     AutoEvalColumn.hellaswag.name: 95.3,
+     AutoEvalColumn.mmlu.name: 86.4,
+     AutoEvalColumn.truthfulqa.name: 59.0,
+     AutoEvalColumn.dummy.name: "GPT-4",
+     AutoEvalColumn.model_type.name: "",
+ }
+
+ gpt35_values = {
+     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
+     AutoEvalColumn.revision.name: "tech report",
+     AutoEvalColumn.precision.name: None,
+     AutoEvalColumn.average.name: 71.9,
+     AutoEvalColumn.arc.name: 85.2,
+     AutoEvalColumn.hellaswag.name: 85.5,
+     AutoEvalColumn.mmlu.name: 70.0,
+     AutoEvalColumn.truthfulqa.name: 47.0,
+     AutoEvalColumn.dummy.name: "GPT-3.5",
+     AutoEvalColumn.model_type.name: "",
+ }
+
+ baseline = {
+     AutoEvalColumn.model.name: "<p>Baseline</p>",
+     AutoEvalColumn.revision.name: "N/A",
+     AutoEvalColumn.precision.name: None,
+     AutoEvalColumn.average.name: 25.0,
+     AutoEvalColumn.arc.name: 25.0,
+     AutoEvalColumn.hellaswag.name: 25.0,
+     AutoEvalColumn.mmlu.name: 25.0,
+     AutoEvalColumn.truthfulqa.name: 25.0,
+     AutoEvalColumn.dummy.name: "baseline",
+     AutoEvalColumn.model_type.name: "",
+ }
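
These hardcoded rows use the same `AutoEvalColumn` keys as the leaderboard DataFrame, so they can presumably be appended to it directly. A small sketch of that use, assuming `leaderboard_df` is the DataFrame built in `app.py`:

```python
import pandas as pd

from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
from src.display_models.utils import AutoEvalColumn

# Sketch: add the GPT-4 / GPT-3.5 / random-baseline reference rows, then re-sort by average score.
extra_rows = pd.DataFrame([gpt4_values, gpt35_values, baseline])
leaderboard_df = pd.concat([leaderboard_df, extra_rows], ignore_index=True)
leaderboard_df = leaderboard_df.sort_values(by=AutoEvalColumn.average.name, ascending=False)
```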
src/assets/scale-hf-logo.png ADDED

Git LFS Details

  • SHA256: 11a263a1abe4c7c9cf022cbe052dc567dcea164bdfbc111299aae3270e992934
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
src/{display/about.py → assets/text_content.py} RENAMED
@@ -1,62 +1,52 @@
1
- from src.display.utils import ModelType
2
 
3
- TITLE = """<h1 style="text-align:left;float:left; id="space-title">🤗 Open LLM Leaderboard</h1> <h3 style="text-align:left;float:left;> Track, rank and evaluate open LLMs and chatbots </h3>"""
4
 
5
  INTRODUCTION_TEXT = """
6
- """
7
 
8
- icons = f"""
9
- - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given text corpora using masked modelling
10
- - {ModelType.CPT.to_str(" : ")} model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
11
- - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
12
- - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
13
- - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
14
  """
15
- LLM_BENCHMARKS_TEXT = """
16
- ## ABOUT
 
17
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
18
 
19
- 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
20
- The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
 
 
 
 
21
 
22
- ### Tasks
23
- 📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 
 
 
24
 
25
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
26
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
27
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
28
- - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA is technically a 6-shot task in the Harness because each example is prepended with 6 Q/A pairs, even in the 0-shot setting.
29
- - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
30
- - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
31
 
32
  For all these evaluations, a higher score is a better score.
33
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
34
 
35
- ### Results
36
  You can find:
37
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
38
- - details on the input/outputs for the models in the `details` of each model, which you can access by clicking the 📄 emoji after the model name
39
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
40
 
41
- If a model's name contains "Flagged", this indicates it has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
42
-
43
- ---------------------------
 
44
 
45
- ## REPRODUCIBILITY
46
- To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
47
- `python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
48
- ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
49
-
50
- ```
51
- python main.py --model=hf-causal-experimental \
52
- --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>" \
53
- --tasks=<task_list> \
54
- --num_fewshot=<n_few_shot> \
55
- --batch_size=1 \
56
- --output_path=<output_path>
57
- ```
58
-
59
- **Note:** We evaluate all models on a single node of 8 H100s, so the global batch size is 8 for each evaluation. If you don't use parallelism, adapt your batch size to fit.
60
  *You can expect results to vary slightly for different batch sizes because of padding.*
61
 
62
  The tasks and few shots parameters are:
@@ -64,122 +54,23 @@ The tasks and few shots parameters are:
64
  - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
65
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
66
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
67
- - Winogrande: 5-shot, *winogrande* (`acc`)
68
- - GSM8k: 5-shot, *gsm8k* (`acc`)
69
-
70
- Side note on the baseline scores:
71
- - for log-likelihood evaluation, we select the random baseline
72
- - for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
73
-
74
- ---------------------------
75
-
76
- ## RESOURCES
77
 
78
- ### Quantization
79
  To get more information about quantization, see:
80
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
81
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
82
 
83
- ### Useful links
84
- - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
85
- - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
86
-
87
- ### Other cool leaderboards:
88
- - [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
89
- - [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
90
-
91
-
92
  """
93
 
94
- FAQ_TEXT = """
95
-
96
- ## SUBMISSIONS
97
- My model requires `trust_remote_code=True`, can I submit it?
98
- - *We only support models that have been integrated into a stable version of the `transformers` library for automatic submission, as we don't want to run possibly unsafe code on our cluster.*
99
-
100
- What about models of type X?
101
- - *We only support models that have been integrated into a stable version of the `transformers` library for automatic submission.*
102
-
103
- How can I follow when my model is launched?
104
- - *You can look for its request file [here](https://huggingface.co/datasets/open-llm-leaderboard/requests) and follow the status evolution, or directly in the queues above the submit form.*
105
-
106
- My model disappeared from all the queues, what happened?
107
- - *A model disappearing from all the queues usually means that there has been a failure. You can check if that is the case by looking for your model [here](https://huggingface.co/datasets/open-llm-leaderboard/requests).*
108
-
109
- What causes an evaluation failure?
110
- - *Most of the failures we get come from problems in the submissions (corrupted files, config problems, wrong parameters selected for eval ...), so we'll be grateful if you first make sure you have followed the steps in `About`. However, from time to time, we have failures on our side (hardware/node failures, problems with an update of our backend, connectivity problems ending up in the results not being saved, ...).*
111
-
112
- How can I report an evaluation failure?
113
- - *As we store the logs for all models, feel free to create an issue, **where you link to the requests file of your model** (look for it [here](https://huggingface.co/datasets/open-llm-leaderboard/requests/tree/main)), so we can investigate! If the model failed due to a problem on our side, we'll relaunch it right away!*
114
- *Note: Please do not re-upload your model under a different name, it will not help*
115
-
116
- ---------------------------
117
-
118
- ## RESULTS
119
- What kind of information can I find?
120
- - *Let's imagine you are interested in the Yi-34B results. You have access to 3 different information categories:*
121
- - *The [request file](https://huggingface.co/datasets/open-llm-leaderboard/requests/blob/main/01-ai/Yi-34B_eval_request_False_bfloat16_Original.json): it gives you information about the status of the evaluation*
122
- - *The [aggregated results folder](https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/01-ai/Yi-34B): it gives you aggregated scores, per experimental run*
123
- - *The [details dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B/tree/main): it gives you the full details (scores and examples for each task and a given model)*
124
-
125
-
126
- Why do models appear several times in the leaderboard?
127
- - *We run evaluations with user-selected precision and model commit. Sometimes, users submit specific models at different commits and at different precisions (for example, in float16 and 4bit to see how quantization affects performance). You should be able to verify this by displaying the `precision` and `model sha` columns in the display. If, however, you see models appearing several times with the same precision and hash commit, this is not normal.*
128
-
129
- What is this concept of "flagging"?
130
- - *This mechanism allows users to report models that have unfair performance on the leaderboard. This contains several categories: exceedingly good results on the leaderboard because the model was (maybe accidentally) trained on the evaluation data, models that are copies of other models not attributed properly, etc.*
131
-
132
- My model has been flagged improperly, what can I do?
133
- - *Every flagged model has a discussion associated with it - feel free to plead your case there, and we'll see what to do together with the community.*
134
-
135
- ---------------------------
136
-
137
- ## HOW TO SEARCH FOR A MODEL
138
- Search for models in the leaderboard by:
139
- 1. Name, e.g., *model_name*
140
- 2. Multiple names, separated by `;`, e.g., *model_name1;model_name2*
141
- 3. License, prefix with `license:`, e.g., *license: MIT*
142
- 4. Combination of name and license, order is irrelevant, e.g., *model_name; license: cc-by-sa-4.0*
143
-
144
- ---------------------------
145
-
146
- ## EDITING SUBMISSIONS
147
- I upgraded my model and want to re-submit, how can I do that?
148
- - *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
149
-
150
- I need to rename my model, how can I do that?
151
- - *You can use @Weyaxi 's [super cool tool](https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-renamer) to request model name changes, then open a discussion where you link to the created pull request, and we'll check them and merge them as needed.*
152
-
153
- ---------------------------
154
-
155
- ## OTHER
156
- Why do you differentiate between pretrained, continuously pretrained, fine-tuned, merges, etc?
157
- - *These different models do not play in the same categories, and therefore need to be separated for fair comparison. Base pretrained models are the most interesting for the community, as they are usually good models to fine-tune later on - any jump in performance from a pretrained model represents a true improvement on the SOTA.
158
- Fine-tuned and IFT/RLHF/chat models usually have better performance, but the latter might be more sensitive to system prompts, which we do not cover at the moment in the Open LLM Leaderboard.
159
- Merges and moerges have artificially inflated performance on test sets, which is not always explainable, and does not always apply to real-world situations.*
160
-
161
- What should I use the leaderboard for?
162
- - *We recommend using the leaderboard for 3 use cases: 1) getting an idea of the state of open pretrained models, by looking only at the ranks and score of this category; 2) experimenting with different fine-tuning methods, datasets, quantization techniques, etc, and comparing their score in a reproducible setup, and 3) checking the performance of a model of interest to you, wrt to other models of its category.*
163
-
164
- Why don't you display closed-source model scores?
165
- - *This is a leaderboard for Open models, both for philosophical reasons (openness is cool) and for practical reasons: we want to ensure that the results we display are accurate and reproducible, but 1) commercial closed models can change their API thus rendering any scoring at a given time incorrect 2) we re-run everything on our cluster to ensure all models are run on the same setup and you can't do that for these models.*
166
-
167
- I have an issue with accessing the leaderboard through the Gradio API
168
- - *Since this is not the recommended way to access the leaderboard, we won't provide support for this, but you can look at tools provided by the community for inspiration!*
169
-
170
- I have another problem, help!
171
- - *Please open an issue in the discussion tab, and we'll do our best to help you in a timely manner :) *
172
- """
173
-
174
-
175
- EVALUATION_QUEUE_TEXT = f"""
176
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
177
 
178
  Models added here will be automatically evaluated on the 🤗 cluster.
179
 
180
- ## Don't forget to read the FAQ and the About tabs for more information!
181
-
182
- ## First steps before submitting a model
183
 
184
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
185
  ```python
@@ -202,17 +93,16 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
202
  ### 4) Fill up your model card
203
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
204
 
205
- ### 5) Select the correct precision
206
- Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
207
-
208
- ## Model types
209
- {icons}
210
  """
211
 
212
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
213
  CITATION_BUTTON_TEXT = r"""
214
  @misc{open-llm-leaderboard,
215
- author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
216
  title = {Open LLM Leaderboard},
217
  year = {2023},
218
  publisher = {Hugging Face},
@@ -275,32 +165,4 @@ CITATION_BUTTON_TEXT = r"""
275
  eprint={2109.07958},
276
  archivePrefix={arXiv},
277
  primaryClass={cs.CL}
278
- }
279
- @misc{DBLP:journals/corr/abs-1907-10641,
280
- title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
281
- author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
282
- year={2019},
283
- eprint={1907.10641},
284
- archivePrefix={arXiv},
285
- primaryClass={cs.CL}
286
- }
287
- @misc{DBLP:journals/corr/abs-2110-14168,
288
- title={Training Verifiers to Solve Math Word Problems},
289
- author={Karl Cobbe and
290
- Vineet Kosaraju and
291
- Mohammad Bavarian and
292
- Mark Chen and
293
- Heewoo Jun and
294
- Lukasz Kaiser and
295
- Matthias Plappert and
296
- Jerry Tworek and
297
- Jacob Hilton and
298
- Reiichiro Nakano and
299
- Christopher Hesse and
300
- John Schulman},
301
- year={2021},
302
- eprint={2110.14168},
303
- archivePrefix={arXiv},
304
- primaryClass={cs.CL}
305
- }
306
- """
 
1
+ from src.display_models.model_metadata_type import ModelType
2
 
3
+ TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
+ 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
7
 
8
+ 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
9
+ The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 
 
 
 
10
  """
11
+
12
+ LLM_BENCHMARKS_TEXT = f"""
13
+ # Context
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
15
 
16
+ ## Icons
17
+ {ModelType.PT.to_str(" : ")} model
18
+ {ModelType.FT.to_str(" : ")} model
19
+ {ModelType.IFT.to_str(" : ")} model
20
+ {ModelType.RL.to_str(" : ")} model
21
+ If there is no icon, we have not uploaded the information on the model yet; feel free to open an issue with the model information!
22
 
23
+ 🏴‍☠️ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
24
+ (For example, the model was trained on the evaluation data and is therefore cheating on the leaderboard.)
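For reference, the icon strings interpolated above come from the `ModelType` enum added in `src/display_models/model_metadata_type.py` later in this diff. A minimal usage sketch (assuming the repo layout introduced by this change):

```python
from src.display_models.model_metadata_type import ModelType

# Each placeholder above expands to "<emoji><separator><type name>":
print(ModelType.PT.to_str(" : "))   # 🟢 : pretrained
print(ModelType.FT.to_str(" : "))   # 🔶 : fine-tuned
print(ModelType.IFT.to_str(" : "))  # ⭕ : instruction-tuned
print(ModelType.RL.to_str(" : "))   # 🟦 : RL-tuned
```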
25
+
26
+ ## How it works
27
+
28
+ 📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
29
 
30
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
31
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
32
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
33
+ - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, as 6 examples are systematically prepended, even when the number of few-shot examples is set to 0.
 
 
34
 
35
  For all these evaluations, a higher score is a better score.
36
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
37
 
38
+ ## Details and logs
39
  You can find:
40
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
41
+ - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
42
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
43
 
44
+ ## Reproducibility
45
+ To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
46
+ `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
47
+ ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
48
 
49
+ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
50
  *You can expect results to vary slightly for different batch sizes because of padding.*
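As an illustration only, here is a small, hypothetical Python wrapper around the two-line command above; the model name, revision and output path are placeholders, not values used by this Space:

```python
import subprocess

# Hypothetical placeholders -- substitute your own model, revision and paths.
MODEL = "<your_model>"
REVISION = "<your_model_revision>"


def run_harness(tasks: str, num_fewshot: int, output_path: str) -> None:
    """Shell out to the Harness main.py with the flags documented above."""
    cmd = [
        "python", "main.py",
        "--model=hf-causal",
        f"--model_args=pretrained={MODEL},use_accelerate=True,revision={REVISION}",
        f"--tasks={tasks}",
        f"--num_fewshot={num_fewshot}",
        "--batch_size=2",
        f"--output_path={output_path}",
    ]
    subprocess.run(cmd, check=True)


# Example: HellaSwag, 10-shot (see the task list below).
run_harness("hellaswag", 10, "results/hellaswag.json")
```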
51
 
52
  The tasks and few-shot parameters are:
 
54
  - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
55
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
56
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
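Since the MMLU number reported here is the unweighted average of the `acc` of all the `hendrycksTest-*` subtasks, here is a quick sketch of that aggregation (it assumes a results JSON with the usual Harness `{"results": {task_name: {"acc": ...}}}` layout; the file path is a placeholder):

```python
import json

# Placeholder path to a results file produced by a Harness run like the one above.
with open("results/mmlu.json") as f:
    results = json.load(f)["results"]

mmlu_accs = [
    metrics["acc"]
    for task_name, metrics in results.items()
    if task_name.startswith("hendrycksTest-")
]
# Unweighted mean over all the MMLU subtasks listed above.
mmlu_score = sum(mmlu_accs) / len(mmlu_accs)
print(f"MMLU (5-shot): {mmlu_score:.4f}")
```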
57
 
58
+ ## Quantization
59
  To get more information about quantization, see:
60
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
61
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
62
 
63
+ ## More resources
64
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/179)!
65
+ We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
66
  """
67
 
68
+ EVALUATION_QUEUE_TEXT = """
69
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
70
 
71
  Models added here will be automatically evaluated on the 🤗 cluster.
72
 
73
+ ## Some good practices before submitting a model
 
 
74
 
75
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
76
  ```python
 
93
  ### 4) Fill up your model card
94
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
95
 
96
+ ## In case of model failure
97
+ If your model is displayed in the `FAILED` category, its execution stopped.
98
+ Make sure you have followed the above steps first.
99
+ If everything is done, check that you can launch the EleutherAI Harness on your model locally, using the above command without modifications (you can add `--limit` to cap the number of examples per task).
 
100
  """
101
 
102
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
103
  CITATION_BUTTON_TEXT = r"""
104
  @misc{open-llm-leaderboard,
105
+ author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
106
  title = {Open LLM Leaderboard},
107
  year = {2023},
108
  publisher = {Hugging Face},
 
165
  eprint={2109.07958},
166
  archivePrefix={arXiv},
167
  primaryClass={cs.CL}
168
+ }"""
src/display/formatting.py DELETED
@@ -1,36 +0,0 @@
1
- from huggingface_hub import HfApi
2
-
3
- API = HfApi()
4
-
5
-
6
- def model_hyperlink(link, model_name):
7
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
8
-
9
-
10
- def make_clickable_model(model_name):
11
- link = f"https://huggingface.co/{model_name}"
12
-
13
- details_model_name = model_name.replace("/", "__")
14
- details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
15
-
16
- return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
17
-
18
-
19
- def styled_error(error):
20
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
21
-
22
-
23
- def styled_warning(warn):
24
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
25
-
26
-
27
- def styled_message(message):
28
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
29
-
30
-
31
- def has_no_nan_values(df, columns):
32
- return df[columns].notna().all(axis=1)
33
-
34
-
35
- def has_nan_values(df, columns):
36
- return df[columns].isna().any(axis=1)
src/display/utils.py DELETED
@@ -1,233 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
- import json
4
- import logging
5
- from datetime import datetime
6
- import pandas as pd
7
-
8
-
9
- # Configure logging
10
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
-
12
- def parse_datetime(datetime_str):
13
- formats = [
14
- "%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
15
- "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
16
- "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
17
- ]
18
-
19
- for fmt in formats:
20
- try:
21
- return datetime.strptime(datetime_str, fmt)
22
- except ValueError:
23
- continue
24
- # in rare cases set unix start time for files with incorrect time (legacy files)
25
- logging.error(f"No valid date format found for: {datetime_str}")
26
- return datetime(1970, 1, 1)
27
-
28
- def load_json_data(file_path):
29
- """Safely load JSON data from a file."""
30
- try:
31
- with open(file_path, "r") as file:
32
- return json.load(file)
33
- except json.JSONDecodeError:
34
- print(f"Error reading JSON from {file_path}")
35
- return None # Or raise an exception
36
-
37
-
38
- def fields(raw_class):
39
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
40
-
41
-
42
- @dataclass
43
- class Task:
44
- benchmark: str
45
- metric: str
46
- col_name: str
47
-
48
-
49
- class Tasks(Enum):
50
- arc = Task("arc:challenge", "acc_norm", "ARC")
51
- hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
52
- mmlu = Task("hendrycksTest", "acc", "MMLU")
53
- truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
54
- winogrande = Task("winogrande", "acc", "Winogrande")
55
- gsm8k = Task("gsm8k", "acc", "GSM8K")
56
-
57
-
58
- # These classes are for user facing column names,
59
- # to avoid having to change them all around the code
60
- # when a modif is needed
61
- @dataclass(frozen=True)
62
- class ColumnContent:
63
- name: str
64
- type: str
65
- displayed_by_default: bool
66
- hidden: bool = False
67
- never_hidden: bool = False
68
- dummy: bool = False
69
-
70
-
71
- auto_eval_column_dict = []
72
- # Init
73
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
74
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
75
- # Scores
76
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
77
- for task in Tasks:
78
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
79
- # Model information
80
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
81
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
82
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
83
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
84
- auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
85
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
86
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
87
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
88
- auto_eval_column_dict.append(
89
- ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
90
- )
91
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
92
- auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
93
- auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
94
- # Dummy column for the search bar (hidden by the custom CSS)
95
- auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
96
-
97
- # We use make dataclass to dynamically fill the scores from Tasks
98
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
99
-
100
-
101
-
102
- @dataclass(frozen=True)
103
- class EvalQueueColumn: # Queue column
104
- model = ColumnContent("model", "markdown", True)
105
- revision = ColumnContent("revision", "str", True)
106
- private = ColumnContent("private", "bool", True)
107
- precision = ColumnContent("precision", "str", True)
108
- weight_type = ColumnContent("weight_type", "str", "Original")
109
- status = ColumnContent("status", "str", True)
110
-
111
-
112
- baseline_row = {
113
- AutoEvalColumn.model.name: "<p>Baseline</p>",
114
- AutoEvalColumn.revision.name: "N/A",
115
- AutoEvalColumn.precision.name: None,
116
- AutoEvalColumn.merged.name: False,
117
- AutoEvalColumn.average.name: 31.0,
118
- AutoEvalColumn.arc.name: 25.0,
119
- AutoEvalColumn.hellaswag.name: 25.0,
120
- AutoEvalColumn.mmlu.name: 25.0,
121
- AutoEvalColumn.truthfulqa.name: 25.0,
122
- AutoEvalColumn.winogrande.name: 50.0,
123
- AutoEvalColumn.gsm8k.name: 0.21,
124
- AutoEvalColumn.fullname.name: "baseline",
125
- AutoEvalColumn.model_type.name: "",
126
- AutoEvalColumn.flagged.name: False,
127
- }
128
-
129
- # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
130
- # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
131
- # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
132
- # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
133
- # TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
134
- # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
135
- # GSM8K: paper
136
- # Define the human baselines
137
- human_baseline_row = {
138
- AutoEvalColumn.model.name: "<p>Human performance</p>",
139
- AutoEvalColumn.revision.name: "N/A",
140
- AutoEvalColumn.precision.name: None,
141
- AutoEvalColumn.average.name: 92.75,
142
- AutoEvalColumn.merged.name: False,
143
- AutoEvalColumn.arc.name: 80.0,
144
- AutoEvalColumn.hellaswag.name: 95.0,
145
- AutoEvalColumn.mmlu.name: 89.8,
146
- AutoEvalColumn.truthfulqa.name: 94.0,
147
- AutoEvalColumn.winogrande.name: 94.0,
148
- AutoEvalColumn.gsm8k.name: 100,
149
- AutoEvalColumn.fullname.name: "human_baseline",
150
- AutoEvalColumn.model_type.name: "",
151
- AutoEvalColumn.flagged.name: False,
152
- }
153
-
154
-
155
- @dataclass
156
- class ModelDetails:
157
- name: str
158
- symbol: str = "" # emoji, only for the model type
159
-
160
-
161
- class ModelType(Enum):
162
- PT = ModelDetails(name="pretrained", symbol="🟢")
163
- CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
164
- FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
165
- chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
166
- merges = ModelDetails(name="base merges and moerges", symbol="🤝")
167
- Unknown = ModelDetails(name="", symbol="?")
168
-
169
- def to_str(self, separator=" "):
170
- return f"{self.value.symbol}{separator}{self.value.name}"
171
-
172
- @staticmethod
173
- def from_str(type):
174
- if "fine-tuned" in type or "🔶" in type:
175
- return ModelType.FT
176
- if "continously pretrained" in type or "🟩" in type:
177
- return ModelType.CPT
178
- if "pretrained" in type or "🟢" in type:
179
- return ModelType.PT
180
- if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
181
- return ModelType.chat
182
- if "merge" in type or "🤝" in type:
183
- return ModelType.merges
184
- return ModelType.Unknown
185
-
186
-
187
- class WeightType(Enum):
188
- Adapter = ModelDetails("Adapter")
189
- Original = ModelDetails("Original")
190
- Delta = ModelDetails("Delta")
191
-
192
-
193
- class Precision(Enum):
194
- float16 = ModelDetails("float16")
195
- bfloat16 = ModelDetails("bfloat16")
196
- qt_8bit = ModelDetails("8bit")
197
- qt_4bit = ModelDetails("4bit")
198
- qt_GPTQ = ModelDetails("GPTQ")
199
- Unknown = ModelDetails("?")
200
-
201
- def from_str(precision):
202
- if precision in ["torch.float16", "float16"]:
203
- return Precision.float16
204
- if precision in ["torch.bfloat16", "bfloat16"]:
205
- return Precision.bfloat16
206
- if precision in ["8bit"]:
207
- return Precision.qt_8bit
208
- if precision in ["4bit"]:
209
- return Precision.qt_4bit
210
- if precision in ["GPTQ", "None"]:
211
- return Precision.qt_GPTQ
212
- return Precision.Unknown
213
-
214
-
215
- # Column selection
216
- COLS = [c.name for c in fields(AutoEvalColumn)]
217
- TYPES = [c.type for c in fields(AutoEvalColumn)]
218
-
219
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
220
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
221
-
222
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
223
-
224
- NUMERIC_INTERVALS = {
225
- "?": pd.Interval(-1, 0, closed="right"),
226
- "~1.5": pd.Interval(0, 2, closed="right"),
227
- "~3": pd.Interval(2, 4, closed="right"),
228
- "~7": pd.Interval(4, 9, closed="right"),
229
- "~13": pd.Interval(9, 20, closed="right"),
230
- "~35": pd.Interval(20, 45, closed="right"),
231
- "~60": pd.Interval(45, 70, closed="right"),
232
- "70+": pd.Interval(70, 10000, closed="right"),
233
- }
src/display_models/get_model_metadata.py ADDED
@@ -0,0 +1,167 @@
1
+ import glob
2
+ import json
3
+ import os
4
+ import re
5
+ import pickle
6
+ from typing import List
7
+
8
+ import huggingface_hub
9
+ from huggingface_hub import HfApi
10
+ from tqdm import tqdm
11
+ from transformers import AutoModel, AutoConfig
12
+ from accelerate import init_empty_weights
13
+
14
+ from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
15
+ from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
16
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
17
+
18
+ api = HfApi(token=os.environ.get("H4_TOKEN", None))
19
+
20
+
21
+ def get_model_infos_from_hub(leaderboard_data: List[dict]):
22
+ # load cache from disk
23
+ try:
24
+ with open("model_info_cache.pkl", "rb") as f:
25
+ model_info_cache = pickle.load(f)
26
+ except (EOFError, FileNotFoundError):
27
+ model_info_cache = {}
28
+ try:
29
+ with open("model_size_cache.pkl", "rb") as f:
30
+ model_size_cache = pickle.load(f)
31
+ except (EOFError, FileNotFoundError):
32
+ model_size_cache = {}
33
+
34
+ for model_data in tqdm(leaderboard_data):
35
+ model_name = model_data["model_name_for_query"]
36
+
37
+ if model_name in model_info_cache:
38
+ model_info = model_info_cache[model_name]
39
+ else:
40
+ try:
41
+ model_info = api.model_info(model_name)
42
+ model_info_cache[model_name] = model_info
43
+ except huggingface_hub.utils._errors.RepositoryNotFoundError:
44
+ print("Repo not found!", model_name)
45
+ model_data[AutoEvalColumn.license.name] = None
46
+ model_data[AutoEvalColumn.likes.name] = None
47
+ if model_name not in model_size_cache:
48
+ model_size_cache[model_name] = get_model_size(model_name, None)
49
+ model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
50
+
51
+ model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
52
+ model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
53
+ if model_name not in model_size_cache:
54
+ model_size_cache[model_name] = get_model_size(model_name, model_info)
55
+ model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
56
+
57
+ # save cache to disk in pickle format
58
+ with open("model_info_cache.pkl", "wb") as f:
59
+ pickle.dump(model_info_cache, f)
60
+ with open("model_size_cache.pkl", "wb") as f:
61
+ pickle.dump(model_size_cache, f)
62
+
63
+
64
+ def get_model_license(model_info):
65
+ try:
66
+ return model_info.cardData["license"]
67
+ except Exception:
68
+ return "?"
69
+
70
+
71
+ def get_model_likes(model_info):
72
+ return model_info.likes
73
+
74
+
75
+ size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
76
+
77
+
78
+ def get_model_size(model_name, model_info):
79
+ # In billions
80
+ try:
81
+ return round(model_info.safetensors["total"] / 1e9, 3)
82
+ except AttributeError:
83
+ try:
84
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=False)
85
+ with init_empty_weights():
86
+ model = AutoModel.from_config(config, trust_remote_code=False)
87
+ return round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9, 3)
88
+ except (EnvironmentError, ValueError, KeyError): # model config not found, likely private
89
+ try:
90
+ size_match = re.search(size_pattern, model_name.lower())
91
+ size = size_match.group(0)
92
+ return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
93
+ except AttributeError:
94
+ return 0
95
+
96
+
97
+ def get_model_type(leaderboard_data: List[dict]):
98
+ for model_data in leaderboard_data:
99
+ request_files = os.path.join(
100
+ "eval-queue",
101
+ model_data["model_name_for_query"] + "_eval_request_*" + ".json",
102
+ )
103
+ request_files = glob.glob(request_files)
104
+
105
+ # Select correct request file (precision)
106
+ request_file = ""
107
+ if len(request_files) == 1:
108
+ request_file = request_files[0]
109
+ elif len(request_files) > 1:
110
+ request_files = sorted(request_files, reverse=True)
111
+ for tmp_request_file in request_files:
112
+ with open(tmp_request_file, "r") as f:
113
+ req_content = json.load(f)
114
+ if (
115
+ req_content["status"] == "FINISHED"
116
+ and req_content["precision"] == model_data["Precision"].split(".")[-1]
117
+ ):
118
+ request_file = tmp_request_file
119
+
120
+ try:
121
+ with open(request_file, "r") as f:
122
+ request = json.load(f)
123
+ model_type = model_type_from_str(request["model_type"])
124
+ model_data[AutoEvalColumn.model_type.name] = model_type.value.name
125
+ model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("🔺" if is_delta else "")
126
+ except Exception:
127
+ if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
128
+ model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
129
+ model_data["model_name_for_query"]
130
+ ].value.name
131
+ model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
132
+ model_data["model_name_for_query"]
133
+ ].value.symbol # + ("🔺" if is_delta else "")
134
+ else:
135
+ model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
136
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
137
+
138
+
139
+ def flag_models(leaderboard_data: List[dict]):
140
+ for model_data in leaderboard_data:
141
+ if model_data["model_name_for_query"] in FLAGGED_MODELS:
142
+ issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
143
+ issue_link = model_hyperlink(
144
+ FLAGGED_MODELS[model_data["model_name_for_query"]],
145
+ f"See discussion #{issue_num}",
146
+ )
147
+ model_data[
148
+ AutoEvalColumn.model.name
149
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
150
+
151
+
152
+ def remove_forbidden_models(leaderboard_data: List[dict]):
153
+ indices_to_remove = []
154
+ for ix, model in enumerate(leaderboard_data):
155
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
156
+ indices_to_remove.append(ix)
157
+
158
+ for ix in reversed(indices_to_remove):
159
+ leaderboard_data.pop(ix)
160
+ return leaderboard_data
161
+
162
+
163
+ def apply_metadata(leaderboard_data: List[dict]):
164
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
165
+ get_model_type(leaderboard_data)
166
+ get_model_infos_from_hub(leaderboard_data)
167
+ flag_models(leaderboard_data)
src/display_models/model_metadata_flags.py ADDED
@@ -0,0 +1,18 @@
1
+ # Models which have been flagged by users as being problematic for a reason or another
2
+ # (Model name to forum discussion link)
3
+ FLAGGED_MODELS = {
4
+ "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
5
+ "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
6
+ "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
7
+ "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
8
+ "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
9
+ "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
10
+ "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
11
+ "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
12
+ "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
13
+ }
14
+
15
+ # Models which have been requested by orgs to not be submitted on the leaderboard
16
+ DO_NOT_SUBMIT_MODELS = [
17
+ "Voicelab/trurl-2-13b", # trained on MMLU
18
+ ]
src/display_models/model_metadata_type.py ADDED
@@ -0,0 +1,555 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict
4
+
5
+
6
+ @dataclass
7
+ class ModelInfo:
8
+ name: str
9
+ symbol: str # emoji
10
+
11
+
12
+ class ModelType(Enum):
13
+ PT = ModelInfo(name="pretrained", symbol="🟢")
14
+ FT = ModelInfo(name="fine-tuned", symbol="🔶")
15
+ IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
16
+ RL = ModelInfo(name="RL-tuned", symbol="🟦")
17
+ Unknown = ModelInfo(name="Unknown", symbol="?")
18
+
19
+ def to_str(self, separator=" "):
20
+ return f"{self.value.symbol}{separator}{self.value.name}"
21
+
22
+
23
+ MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
+ "tiiuae/falcon-180B": ModelType.PT,
25
+ "tiiuae/falcon-180B-chat": ModelType.RL,
26
+ "microsoft/phi-1_5": ModelType.PT,
27
+ "Qwen/Qwen-7B": ModelType.PT,
28
+ "Qwen/Qwen-7B-Chat": ModelType.RL,
29
+ "notstoic/PygmalionCoT-7b": ModelType.IFT,
30
+ "aisquared/dlite-v1-355m": ModelType.IFT,
31
+ "aisquared/dlite-v1-1_5b": ModelType.IFT,
32
+ "aisquared/dlite-v1-774m": ModelType.IFT,
33
+ "aisquared/dlite-v1-124m": ModelType.IFT,
34
+ "aisquared/chopt-2_7b": ModelType.IFT,
35
+ "aisquared/dlite-v2-124m": ModelType.IFT,
36
+ "aisquared/dlite-v2-774m": ModelType.IFT,
37
+ "aisquared/dlite-v2-1_5b": ModelType.IFT,
38
+ "aisquared/chopt-1_3b": ModelType.IFT,
39
+ "aisquared/dlite-v2-355m": ModelType.IFT,
40
+ "augtoma/qCammel-13": ModelType.IFT,
41
+ "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
42
+ "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
43
+ "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
44
+ "TheBloke/tulu-7B-fp16": ModelType.IFT,
45
+ "TheBloke/guanaco-7B-HF": ModelType.FT,
46
+ "TheBloke/koala-7B-HF": ModelType.FT,
47
+ "TheBloke/wizardLM-7B-HF": ModelType.IFT,
48
+ "TheBloke/airoboros-13B-HF": ModelType.IFT,
49
+ "TheBloke/koala-13B-HF": ModelType.FT,
50
+ "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
51
+ "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
52
+ "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
53
+ "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
54
+ "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
55
+ "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
56
+ "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
57
+ "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
58
+ "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
59
+ "TheBloke/guanaco-13B-HF": ModelType.FT,
60
+ "TheBloke/guanaco-65B-HF": ModelType.FT,
61
+ "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
62
+ "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
63
+ "TheBloke/Llama-2-13B-fp16": ModelType.PT,
64
+ "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
65
+ "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
66
+ "TheBloke/Planner-7B-fp16": ModelType.IFT,
67
+ "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
68
+ "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
69
+ "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
70
+ "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
71
+ "TheBloke/tulu-13B-fp16": ModelType.IFT,
72
+ "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
73
+ "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
74
+ "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
75
+ "TheBloke/robin-13B-v2-fp16": ModelType.FT,
76
+ "TheBloke/robin-33B-v2-fp16": ModelType.FT,
77
+ "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
78
+ "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
79
+ "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
80
+ "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
81
+ "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
82
+ "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
83
+ "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
84
+ "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
85
+ "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
86
+ "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
87
+ "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
88
+ "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
89
+ "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
90
+ "concedo/OPT-19M-ChatSalad": ModelType.FT,
91
+ "concedo/Pythia-70M-ChatSalad": ModelType.FT,
92
+ "digitous/13B-HyperMantis": ModelType.IFT,
93
+ "digitous/Adventien-GPTJ": ModelType.FT,
94
+ "digitous/Alpacino13b": ModelType.IFT,
95
+ "digitous/GPT-R": ModelType.IFT,
96
+ "digitous/Javelin-R": ModelType.IFT,
97
+ "digitous/Javalion-GPTJ": ModelType.IFT,
98
+ "digitous/Javalion-R": ModelType.IFT,
99
+ "digitous/Skegma-GPTJ": ModelType.FT,
100
+ "digitous/Alpacino30b": ModelType.IFT,
101
+ "digitous/Janin-GPTJ": ModelType.FT,
102
+ "digitous/Janin-R": ModelType.FT,
103
+ "digitous/Javelin-GPTJ": ModelType.FT,
104
+ "SaylorTwift/gpt2_test": ModelType.PT,
105
+ "anton-l/gpt-j-tiny-random": ModelType.FT,
106
+ "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
107
+ "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
108
+ "Lazycuber/Janemalion-6B": ModelType.FT,
109
+ "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
110
+ "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
111
+ "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
112
+ "gpt2-medium": ModelType.PT,
113
+ "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
114
+ "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
115
+ "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
116
+ "PygmalionAI/pygmalion-6b": ModelType.FT,
117
+ "PygmalionAI/metharme-1.3b": ModelType.IFT,
118
+ "PygmalionAI/pygmalion-1.3b": ModelType.FT,
119
+ "PygmalionAI/pygmalion-350m": ModelType.FT,
120
+ "PygmalionAI/pygmalion-2.7b": ModelType.FT,
121
+ "medalpaca/medalpaca-7b": ModelType.FT,
122
+ "lilloukas/Platypus-30B": ModelType.IFT,
123
+ "lilloukas/GPlatty-30B": ModelType.FT,
124
+ "mncai/chatdoctor": ModelType.FT,
125
+ "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
126
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
127
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
128
+ "hakurei/instruct-12b": ModelType.IFT,
129
+ "hakurei/lotus-12B": ModelType.FT,
130
+ "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
131
+ "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
132
+ "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
133
+ "mosaicml/mpt-7b-instruct": ModelType.IFT,
134
+ "mosaicml/mpt-30b-chat": ModelType.IFT,
135
+ "mosaicml/mpt-7b-storywriter": ModelType.FT,
136
+ "mosaicml/mpt-30b-instruct": ModelType.IFT,
137
+ "mosaicml/mpt-7b-chat": ModelType.IFT,
138
+ "mosaicml/mpt-30b": ModelType.PT,
139
+ "Corianas/111m": ModelType.IFT,
140
+ "Corianas/Quokka_1.3b": ModelType.IFT,
141
+ "Corianas/256_5epoch": ModelType.FT,
142
+ "Corianas/Quokka_256m": ModelType.IFT,
143
+ "Corianas/Quokka_590m": ModelType.IFT,
144
+ "Corianas/gpt-j-6B-Dolly": ModelType.FT,
145
+ "Corianas/Quokka_2.7b": ModelType.IFT,
146
+ "cyberagent/open-calm-7b": ModelType.FT,
147
+ "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
148
+ "THUDM/chatglm2-6b": ModelType.IFT,
149
+ "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
150
+ "NYTK/PULI-GPTrio": ModelType.PT,
151
+ "EleutherAI/pythia-1.3b": ModelType.PT,
152
+ "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
153
+ "EleutherAI/gpt-neo-125m": ModelType.PT,
154
+ "EleutherAI/pythia-160m": ModelType.PT,
155
+ "EleutherAI/gpt-neo-2.7B": ModelType.PT,
156
+ "EleutherAI/pythia-1b-deduped": ModelType.PT,
157
+ "EleutherAI/pythia-6.7b": ModelType.PT,
158
+ "EleutherAI/pythia-70m-deduped": ModelType.PT,
159
+ "EleutherAI/gpt-neox-20b": ModelType.PT,
160
+ "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
161
+ "EleutherAI/pythia-2.7b": ModelType.PT,
162
+ "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
163
+ "EleutherAI/pythia-70m": ModelType.PT,
164
+ "EleutherAI/gpt-j-6b": ModelType.PT,
165
+ "EleutherAI/pythia-12b-deduped": ModelType.PT,
166
+ "EleutherAI/gpt-neo-1.3B": ModelType.PT,
167
+ "EleutherAI/pythia-410m-deduped": ModelType.PT,
168
+ "EleutherAI/pythia-160m-deduped": ModelType.PT,
169
+ "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
170
+ "EleutherAI/pythia-12b": ModelType.PT,
171
+ "roneneldan/TinyStories-33M": ModelType.PT,
172
+ "roneneldan/TinyStories-28M": ModelType.PT,
173
+ "roneneldan/TinyStories-1M": ModelType.PT,
174
+ "roneneldan/TinyStories-8M": ModelType.PT,
175
+ "roneneldan/TinyStories-3M": ModelType.PT,
176
+ "jerryjalapeno/nart-100k-7b": ModelType.FT,
177
+ "lmsys/vicuna-13b-v1.3": ModelType.IFT,
178
+ "lmsys/vicuna-7b-v1.3": ModelType.IFT,
179
+ "lmsys/vicuna-13b-v1.1": ModelType.IFT,
180
+ "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
181
+ "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
182
+ "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
183
+ "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
184
+ "Gryphe/MythoLogic-13b": ModelType.IFT,
185
+ "Gryphe/MythoBoros-13b": ModelType.IFT,
186
+ "pillowtalks-ai/delta13b": ModelType.FT,
187
+ "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
188
+ "bigscience/bloom-7b1": ModelType.PT,
189
+ "bigcode/tiny_starcoder_py": ModelType.PT,
190
+ "bigcode/starcoderplus": ModelType.FT,
191
+ "bigcode/gpt_bigcode-santacoder": ModelType.PT,
192
+ "bigcode/starcoder": ModelType.PT,
193
+ "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
194
+ "microsoft/DialoGPT-large": ModelType.FT,
195
+ "microsoft/DialoGPT-small": ModelType.FT,
196
+ "microsoft/DialoGPT-medium": ModelType.FT,
197
+ "microsoft/CodeGPT-small-py": ModelType.FT,
198
+ "Tincando/fiction_story_generator": ModelType.FT,
199
+ "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
200
+ "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
201
+ "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
202
+ "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
203
+ "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
204
+ "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
205
+ "illuin/test-custom-llama": ModelType.FT,
206
+ "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
207
+ "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
208
+ "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
209
+ "dvruette/llama-13b-pretrained": ModelType.PT,
210
+ "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
211
+ "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
212
+ "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
213
+ "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
214
+ "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
215
+ "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
216
+ "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
217
+ "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
218
+ "openlm-research/open_llama_7b": ModelType.PT,
219
+ "openlm-research/open_llama_7b_v2": ModelType.PT,
220
+ "openlm-research/open_llama_3b": ModelType.PT,
221
+ "openlm-research/open_llama_13b": ModelType.PT,
222
+ "openlm-research/open_llama_3b_v2": ModelType.PT,
223
+ "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
224
+ "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
225
+ "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
226
+ "databricks/dolly-v2-7b": ModelType.IFT,
227
+ "databricks/dolly-v2-3b": ModelType.IFT,
228
+ "databricks/dolly-v2-12b": ModelType.IFT,
229
+ "Rachneet/gpt2-xl-alpaca": ModelType.FT,
230
+ "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
231
+ "psyche/kogpt": ModelType.FT,
232
+ "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
233
+ "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
234
+ "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
235
+ "Fredithefish/CrimsonPajama": ModelType.IFT,
236
+ "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
237
+ "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
238
+ "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
239
+ "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
240
+ "eachadea/vicuna-13b-1.1": ModelType.FT,
241
+ "eachadea/vicuna-7b-1.1": ModelType.FT,
242
+ "eachadea/vicuna-13b": ModelType.FT,
243
+ "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
244
+ "openaccess-ai-collective/manticore-13b": ModelType.IFT,
245
+ "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
246
+ "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
247
+ "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
248
+ "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
249
+ "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
250
+ "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
251
+ "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
252
+ "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
253
+ "stabilityai/StableBeluga1-Delta": ModelType.IFT,
254
+ "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
255
+ "stabilityai/StableBeluga2": ModelType.IFT,
256
+ "stabilityai/StableBeluga-13B": ModelType.IFT,
257
+ "stabilityai/StableBeluga-7B": ModelType.IFT,
258
+ "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
259
+ "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
260
+ "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
261
+ "alibidaran/medical_transcription_generator": ModelType.FT,
262
+ "CalderaAI/30B-Lazarus": ModelType.IFT,
263
+ "CalderaAI/13B-BlueMethod": ModelType.IFT,
264
+ "CalderaAI/13B-Ouroboros": ModelType.IFT,
265
+ "KoboldAI/OPT-13B-Erebus": ModelType.FT,
266
+ "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
267
+ "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
268
+ "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
269
+ "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
270
+ "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
271
+ "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
272
+ "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
273
+ "KoboldAI/fairseq-dense-125M": ModelType.PT,
274
+ "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
275
+ "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
276
+ "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
277
+ "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
278
+ "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
279
+ "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
280
+ "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
281
+ "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
282
+ "KoboldAI/fairseq-dense-355M": ModelType.PT,
283
+ "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
284
+ "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
285
+ "KoboldAI/OPT-350M-Erebus": ModelType.FT,
286
+ "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
287
+ "KoboldAI/OPT-30B-Erebus": ModelType.FT,
288
+ "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
289
+ "klosax/open_llama_3b_350bt_preview": ModelType.PT,
290
+ "klosax/openllama-3b-350bt": ModelType.PT,
291
+ "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
292
+ "klosax/open_llama_13b_600bt_preview": ModelType.PT,
293
+ "klosax/open_llama_7b_400bt_preview": ModelType.PT,
294
+ "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
295
+ "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
296
+ "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
297
+ "TFLai/gpt2-turkish-uncased": ModelType.FT,
298
+ "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
299
+ "ehartford/dolphin-llama-13b": ModelType.IFT,
300
+ "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
301
+ "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
302
+ "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
303
+ "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
304
+ "ehartford/based-30b": ModelType.FT,
305
+ "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
306
+ "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
307
+ "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
308
+ "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
309
+ "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
310
+ "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
311
+ "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
312
+ "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
313
+ "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
314
+ "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
315
+ "junelee/wizard-vicuna-13b": ModelType.FT,
316
+ "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
317
+ "BreadAi/MuseCan": ModelType.PT,
318
+ "BreadAi/MusePy-1-2": ModelType.PT,
319
+ "BreadAi/DiscordPy": ModelType.PT,
320
+ "BreadAi/PM_modelV2": ModelType.PT,
321
+ "BreadAi/gpt-Youtube": ModelType.PT,
322
+ "BreadAi/StoryPy": ModelType.FT,
323
+ "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
324
+ "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
325
+ "AGI-inc/lora_moe_7b": ModelType.FT,
326
+ "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
327
+ "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
328
+ "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
329
+ "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
330
+ "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
331
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
332
+ "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
333
+ "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
334
+ "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
335
+ "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
336
+ "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
337
+ "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
338
+ "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
339
+ "Writer/camel-5b-hf": ModelType.IFT,
340
+ "Writer/palmyra-base": ModelType.PT,
341
+ "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
342
+ "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
343
+ "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
344
+ "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
345
+ "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
346
+ "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
347
+ "MBZUAI/lamini-neo-125m": ModelType.IFT,
348
+ "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
349
+ "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
350
+ "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
351
+ "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
352
+ "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
353
+ "TehVenom/Dolly_Malion-6b": ModelType.FT,
354
+ "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
355
+ "TehVenom/ChanMalion": ModelType.FT,
356
+ "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
357
+ "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
358
+ "TehVenom/Metharme-13b-Merged": ModelType.IFT,
359
+ "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
360
+ "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
361
+ "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
362
+ "vicgalle/gpt2-alpaca": ModelType.IFT,
363
+ "vicgalle/alpaca-7b": ModelType.FT,
364
+ "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
365
+ "facebook/opt-350m": ModelType.PT,
366
+ "facebook/opt-125m": ModelType.PT,
367
+ "facebook/xglm-4.5B": ModelType.PT,
368
+ "facebook/opt-2.7b": ModelType.PT,
369
+ "facebook/opt-6.7b": ModelType.PT,
370
+ "facebook/galactica-30b": ModelType.PT,
371
+ "facebook/opt-13b": ModelType.PT,
372
+ "facebook/opt-66b": ModelType.PT,
373
+ "facebook/xglm-7.5B": ModelType.PT,
374
+ "facebook/xglm-564M": ModelType.PT,
375
+ "facebook/opt-30b": ModelType.PT,
376
+ "golaxy/gogpt-7b": ModelType.FT,
377
+ "golaxy/gogpt2-7b": ModelType.FT,
378
+ "golaxy/gogpt-7b-bloom": ModelType.FT,
379
+ "golaxy/gogpt-3b-bloom": ModelType.FT,
380
+ "psmathur/orca_mini_v2_7b": ModelType.IFT,
381
+ "psmathur/orca_mini_7b": ModelType.IFT,
382
+ "psmathur/orca_mini_3b": ModelType.IFT,
383
+ "psmathur/orca_mini_v2_13b": ModelType.IFT,
384
+ "gpt2-xl": ModelType.PT,
385
+ "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
386
+ "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
387
+ "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
388
+ "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
389
+ "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
390
+ "jzjiao/opt-1.3b-rlhf": ModelType.FT,
391
+ "HuggingFaceH4/starchat-beta": ModelType.IFT,
392
+ "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
393
+ "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
394
+ "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
395
+ "openchat/openchat_8192": ModelType.IFT,
396
+ "openchat/openchat_v2": ModelType.IFT,
397
+ "openchat/openchat_v2_w": ModelType.IFT,
398
+ "ausboss/llama-13b-supercot": ModelType.IFT,
399
+ "ausboss/llama-30b-supercot": ModelType.IFT,
400
+ "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
401
+ "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
402
+ "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
403
+ "victor123/WizardLM-13B-1.0": ModelType.IFT,
404
+ "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
405
+ "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
406
+ "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
407
+ "baichuan-inc/Baichuan-7B": ModelType.PT,
408
+ "tiiuae/falcon-40b-instruct": ModelType.IFT,
409
+ "tiiuae/falcon-40b": ModelType.PT,
410
+ "tiiuae/falcon-7b": ModelType.PT,
411
+ "YeungNLP/firefly-llama-13b": ModelType.FT,
412
+ "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
413
+ "YeungNLP/firefly-llama2-13b": ModelType.FT,
414
+ "YeungNLP/firefly-ziya-13b": ModelType.FT,
415
+ "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
416
+ "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
417
+ "xzuyn/MedicWizard-7B": ModelType.FT,
418
+ "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
419
+ "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
420
+ "beomi/llama-2-ko-7b": ModelType.IFT,
421
+ "Salesforce/codegen-6B-multi": ModelType.PT,
422
+ "Salesforce/codegen-16B-nl": ModelType.PT,
423
+ "Salesforce/codegen-6B-nl": ModelType.PT,
424
+ "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
425
+ "gpt2-large": ModelType.PT,
426
+ "frank098/orca_mini_3b_juniper": ModelType.FT,
427
+ "frank098/WizardLM_13B_juniper": ModelType.FT,
428
+ "FPHam/Free_Sydney_13b_HF": ModelType.FT,
429
+ "huggingface/llama-13b": ModelType.PT,
430
+ "huggingface/llama-7b": ModelType.PT,
431
+ "huggingface/llama-65b": ModelType.PT,
432
+ "huggingface/llama-30b": ModelType.PT,
433
+ "Henk717/chronoboros-33B": ModelType.IFT,
434
+ "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
435
+ "jondurbin/airoboros-7b": ModelType.IFT,
436
+ "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
437
+ "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
438
+ "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
439
+ "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
440
+ "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
441
+ "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
442
+ "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
443
+ "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
444
+ "jondurbin/airoboros-13b": ModelType.IFT,
445
+ "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
446
+ "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
447
+ "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
448
+ "ariellee/SuperPlatty-30B": ModelType.IFT,
449
+ "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
450
+ "cerebras/Cerebras-GPT-256M": ModelType.PT,
451
+ "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
452
+ "cerebras/Cerebras-GPT-13B": ModelType.PT,
453
+ "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
454
+ "cerebras/Cerebras-GPT-111M": ModelType.PT,
455
+ "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
456
+ "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
457
+ "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
458
+ "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
459
+ "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
460
+ "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
461
+ "NousResearch/Nous-Hermes-13b": ModelType.IFT,
462
+ "project-baize/baize-v2-7b": ModelType.IFT,
463
+ "project-baize/baize-v2-13b": ModelType.IFT,
464
+ "LLMs/WizardLM-13B-V1.0": ModelType.FT,
465
+ "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
466
+ "wenge-research/yayi-7b": ModelType.FT,
467
+ "wenge-research/yayi-7b-llama2": ModelType.FT,
468
+ "wenge-research/yayi-13b-llama2": ModelType.FT,
469
+ "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
470
+ "llama-anon/instruct-13b": ModelType.IFT,
471
+ "huggingtweets/jerma985": ModelType.FT,
472
+ "huggingtweets/gladosystem": ModelType.FT,
473
+ "huggingtweets/bladeecity-jerma985": ModelType.FT,
474
+ "huggyllama/llama-13b": ModelType.PT,
475
+ "huggyllama/llama-65b": ModelType.PT,
476
+ "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
477
+ "upstage/Llama-2-70b-instruct": ModelType.IFT,
478
+ "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
479
+ "upstage/llama-65b-instruct": ModelType.IFT,
480
+ "upstage/llama-30b-instruct-2048": ModelType.IFT,
481
+ "upstage/llama-30b-instruct": ModelType.IFT,
482
+ "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
483
+ "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
484
+ "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
485
+ "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
486
+ "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
487
+ "gpt2": ModelType.PT,
488
+ "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
489
+ "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
490
+ "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
491
+ "quantumaikr/KoreanLM-hf": ModelType.FT,
492
+ "quantumaikr/open_llama_7b_hf": ModelType.FT,
493
+ "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
494
+ "MayaPH/FinOPT-Lincoln": ModelType.FT,
495
+ "MayaPH/FinOPT-Franklin": ModelType.FT,
496
+ "MayaPH/GodziLLa-30B": ModelType.IFT,
497
+ "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
498
+ "MayaPH/FinOPT-Washington": ModelType.FT,
499
+ "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
500
+ "layoric/llama-2-13b-code-alpaca": ModelType.FT,
501
+ "CobraMamba/mamba-gpt-3b": ModelType.FT,
502
+ "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
503
+ "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
504
+ "timdettmers/guanaco-33b-merged": ModelType.FT,
505
+ "elinas/chronos-33b": ModelType.IFT,
506
+ "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
507
+ "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
508
+ "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
509
+ "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
510
+ "meta-llama/Llama-2-7b-hf": ModelType.PT,
511
+ "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
512
+ "meta-llama/Llama-2-13b-hf": ModelType.PT,
513
+ "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
514
+ "meta-llama/Llama-2-70b-hf": ModelType.PT,
515
+ "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
516
+ "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
517
+ "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
518
+ "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
519
+ "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
520
+ "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
521
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
522
+ "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
523
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
524
+ "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
525
+ "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
526
+ "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
527
+ "bofenghuang/vigogne-13b-chat": ModelType.FT,
528
+ "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
529
+ "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
530
+ "bofenghuang/vigogne-7b-chat": ModelType.FT,
531
+ "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
532
+ "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
533
+ "ewof/koishi-instruct-3b": ModelType.IFT,
534
+ "gywy/llama2-13b-chinese-v1": ModelType.FT,
535
+ "GOAT-AI/GOAT-7B-Community": ModelType.FT,
536
+ "psyche/kollama2-7b": ModelType.FT,
537
+ "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
538
+ "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
539
+ "augtoma/qCammel-70-x": ModelType.IFT,
540
+ "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
541
+ "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
542
+ "64bits/LexPodLM-13B": ModelType.FT,
543
+ }
544
+
545
+
546
+ def model_type_from_str(type):
547
+ if "fine-tuned" in type or "🔶" in type:
548
+ return ModelType.FT
549
+ if "pretrained" in type or "🟢" in type:
550
+ return ModelType.PT
551
+ if "RL-tuned" in type or "🟦" in type:
552
+ return ModelType.RL
553
+ if "instruction-tuned" in type or "⭕" in type:
554
+ return ModelType.IFT
555
+ return ModelType.Unknown
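A minimal usage sketch of the type parsing added above — the import path is an assumption (only `model_type_from_str` and the `ModelType` members appear in this diff):

```python
# Hedged sketch: exercising the keyword/emoji fallback in model_type_from_str.
# The module path below is an assumption; adjust it to wherever this module lives.
from src.display_models.model_metadata_type import ModelType, model_type_from_str

assert model_type_from_str("fine-tuned on an instruction dataset") == ModelType.FT
assert model_type_from_str("🟢 pretrained") == ModelType.PT
assert model_type_from_str("RL-tuned with PPO") == ModelType.RL
assert model_type_from_str("no recognisable keyword") == ModelType.Unknown
```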
src/display_models/modelcard_filter.py ADDED
@@ -0,0 +1,26 @@
1
+ import huggingface_hub
2
+ from huggingface_hub import ModelCard
3
+
4
+
5
+ # ht to @Wauplin, thank you for the snippet!
6
+ # See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
7
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
8
+ # Returns operation status, and error message
9
+ try:
10
+ card = ModelCard.load(repo_id)
11
+ except huggingface_hub.utils.EntryNotFoundError:
12
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
13
+
14
+ # Enforce license metadata
15
+ if card.data.license is None:
16
+ if not ("license_name" in card.data and "license_link" in card.data):
17
+ return False, (
18
+ "License not found. Please add a license to your model card using the `license` metadata or a"
19
+ " `license_name`/`license_link` pair."
20
+ )
21
+
22
+ # Enforce card content
23
+ if len(card.text) < 200:
24
+ return False, "Please add a description to your model card, it is too short."
25
+
26
+ return True, ""
src/display_models/read_results.py ADDED
@@ -0,0 +1,153 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Tuple
5
+
6
+ import dateutil
7
+ import numpy as np
8
+
9
+ from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
+
11
+ METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
+ BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
13
+ BENCH_TO_NAME = {
14
+ "arc:challenge": AutoEvalColumn.arc.name,
15
+ "hellaswag": AutoEvalColumn.hellaswag.name,
16
+ "hendrycksTest": AutoEvalColumn.mmlu.name,
17
+ "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
18
+ }
19
+
20
+
21
+ @dataclass
22
+ class EvalResult:
23
+ eval_name: str
24
+ org: str
25
+ model: str
26
+ revision: str
27
+ results: dict
28
+ precision: str = ""
29
+ model_type: str = ""
30
+ weight_type: str = "Original"
31
+ date: str = ""
32
+
33
+ def to_dict(self):
34
+ from src.load_from_hub import is_model_on_hub
35
+
36
+ if self.org is not None:
37
+ base_model = f"{self.org}/{self.model}"
38
+ else:
39
+ base_model = f"{self.model}"
40
+ data_dict = {}
41
+
42
+ data_dict["eval_name"] = self.eval_name # not a column, just a save name
43
+ data_dict["weight_type"] = self.weight_type # not a column, just a save name
44
+ data_dict[AutoEvalColumn.precision.name] = self.precision
45
+ data_dict[AutoEvalColumn.model_type.name] = self.model_type
46
+ data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
47
+ data_dict[AutoEvalColumn.dummy.name] = base_model
48
+ data_dict[AutoEvalColumn.revision.name] = self.revision
49
+ data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
50
+ data_dict[AutoEvalColumn.still_on_hub.name] = (
51
+ is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
52
+ )
53
+
54
+ for benchmark in BENCHMARKS:
55
+ if benchmark not in self.results.keys():
56
+ self.results[benchmark] = None
57
+
58
+ for k, v in BENCH_TO_NAME.items():
59
+ data_dict[v] = self.results[k]
60
+
61
+ return data_dict
62
+
63
+
64
+ def parse_eval_result(json_filepath: str) -> Tuple[str, List[EvalResult]]:
65
+ with open(json_filepath) as fp:
66
+ data = json.load(fp)
67
+
68
+ for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
69
+ if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
70
+ return None, [] # we skip models with the wrong version
71
+
72
+ try:
73
+ config = data["config"]
74
+ except KeyError:
75
+ config = data["config_general"]
76
+ model = config.get("model_name", None)
77
+ if model is None:
78
+ model = config.get("model_args", None)
79
+
80
+ model_sha = config.get("model_sha", "")
81
+ model_split = model.split("/", 1)
82
+
83
+ precision = config.get("model_dtype")
84
+
85
+ model = model_split[-1]
86
+
87
+ if len(model_split) == 1:
88
+ org = None
89
+ model = model_split[0]
90
+ result_key = f"{model}_{precision}"
91
+ else:
92
+ org = model_split[0]
93
+ model = model_split[1]
94
+ result_key = f"{org}_{model}_{precision}"
95
+
96
+ eval_results = []
97
+ for benchmark, metric in zip(BENCHMARKS, METRICS):
98
+ accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
99
+ if accs.size == 0 or any([acc is None for acc in accs]):
100
+ continue
101
+ mean_acc = np.mean(accs) * 100.0
102
+ eval_results.append(
103
+ EvalResult(
104
+ eval_name=result_key,
105
+ org=org,
106
+ model=model,
107
+ revision=model_sha,
108
+ results={benchmark: mean_acc},
109
+ precision=precision, # todo model_type=, weight_type=
110
+ date=config.get("submission_date"),
111
+ )
112
+ )
113
+
114
+ return result_key, eval_results
115
+
116
+
117
+ def get_eval_results() -> List[EvalResult]:
118
+ json_filepaths = []
119
+
120
+ for root, _, files in os.walk("eval-results"):
121
+ # We should only have json files in model results
122
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
123
+ continue
124
+
125
+ # Sort the files by date
126
+ # store results by precision maybe?
127
+ try:
128
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
129
+ except dateutil.parser._parser.ParserError:
130
+ files = [files[-1]]
131
+
132
+ # up_to_date = files[-1]
133
+ for file in files:
134
+ json_filepaths.append(os.path.join(root, file))
135
+
136
+ eval_results = {}
137
+ for json_filepath in json_filepaths:
138
+ result_key, results = parse_eval_result(json_filepath)
139
+ for eval_result in results:
140
+ if result_key in eval_results.keys():
141
+ eval_results[result_key].results.update(eval_result.results)
142
+ else:
143
+ eval_results[result_key] = eval_result
144
+
145
+ eval_results = [v for v in eval_results.values()]
146
+
147
+ return eval_results
148
+
149
+
150
+ def get_eval_results_dicts() -> List[Dict]:
151
+ eval_results = get_eval_results()
152
+
153
+ return [e.to_dict() for e in eval_results]
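A minimal sketch of how the leaderboard code might consume these results, assuming the results dataset has already been cloned into the hardcoded `eval-results` folder:

```python
# Hedged usage sketch: get_eval_results_dicts walks the local "eval-results" folder.
from src.display_models.read_results import get_eval_results_dicts

rows = get_eval_results_dicts()  # one dict per (org, model, precision) entry
print(f"Loaded {len(rows)} result rows")
```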
src/display_models/utils.py ADDED
@@ -0,0 +1,146 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ API = HfApi()
7
+
8
+
9
+ # These classes are for user facing column names, to avoid having to change them
10
+ # all around the code when a modif is needed
11
+ @dataclass
12
+ class ColumnContent:
13
+ name: str
14
+ type: str
15
+ displayed_by_default: bool
16
+ hidden: bool = False
17
+
18
+
19
+ def fields(raw_class):
20
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
+
22
+
23
+ @dataclass(frozen=True)
24
+ class AutoEvalColumn: # Auto evals column
25
+ model_type_symbol = ColumnContent("T", "str", True)
26
+ model = ColumnContent("Model", "markdown", True)
27
+ average = ColumnContent("Average ⬆️", "number", True)
28
+ arc = ColumnContent("ARC", "number", True)
29
+ hellaswag = ColumnContent("HellaSwag", "number", True)
30
+ mmlu = ColumnContent("MMLU", "number", True)
31
+ truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
+ model_type = ColumnContent("Type", "str", False)
33
+ precision = ColumnContent("Precision", "str", False) # , True)
34
+ license = ColumnContent("Hub License", "str", False)
35
+ params = ColumnContent("#Params (B)", "number", False)
36
+ likes = ColumnContent("Hub ❤️", "number", False)
37
+ still_on_hub = ColumnContent("Available on the hub", "bool", False)
38
+ revision = ColumnContent("Model sha", "str", False, False)
39
+ dummy = ColumnContent(
40
+ "model_name_for_query", "str", True
41
+ ) # dummy col to implement search bar (hidden by custom CSS)
42
+
43
+
44
+ @dataclass(frozen=True)
45
+ class EloEvalColumn: # Elo evals column
46
+ model = ColumnContent("Model", "markdown", True)
47
+ gpt4 = ColumnContent("GPT-4 (all)", "number", True)
48
+ human_all = ColumnContent("Human (all)", "number", True)
49
+ human_instruct = ColumnContent("Human (instruct)", "number", True)
50
+ human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
51
+
52
+
53
+ @dataclass(frozen=True)
54
+ class EvalQueueColumn: # Queue column
55
+ model = ColumnContent("model", "markdown", True)
56
+ revision = ColumnContent("revision", "str", True)
57
+ private = ColumnContent("private", "bool", True)
58
+ precision = ColumnContent("precision", "str", True)
59
+ weight_type = ColumnContent("weight_type", "str", "Original")
60
+ status = ColumnContent("status", "str", True)
61
+
62
+
63
+ LLAMAS = [
64
+ "huggingface/llama-7b",
65
+ "huggingface/llama-13b",
66
+ "huggingface/llama-30b",
67
+ "huggingface/llama-65b",
68
+ ]
69
+
70
+
71
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
72
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
73
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
74
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
75
+ MODEL_PAGE = "https://huggingface.co/models"
76
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
77
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
78
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
79
+
80
+
81
+ def model_hyperlink(link, model_name):
82
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
83
+
84
+
85
+ def make_clickable_model(model_name):
86
+ link = f"https://huggingface.co/{model_name}"
87
+
88
+ if model_name in LLAMAS:
89
+ link = LLAMA_LINK
90
+ model_name = model_name.split("/")[1]
91
+ elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
92
+ link = VICUNA_LINK
93
+ model_name = "stable-vicuna-13b"
94
+ elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
95
+ link = ALPACA_LINK
96
+ model_name = "alpaca-13b"
97
+ if model_name == "dolly-12b":
98
+ link = DOLLY_LINK
99
+ elif model_name == "vicuna-13b":
100
+ link = VICUNA_LINK
101
+ elif model_name == "koala-13b":
102
+ link = KOALA_LINK
103
+ elif model_name == "oasst-12b":
104
+ link = OASST_LINK
105
+
106
+ details_model_name = model_name.replace("/", "__")
107
+ details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
108
+
109
+ if not bool(os.getenv("DEBUG", "False")):
110
+ # We only add these checks when not debugging, as they are extremely slow
111
+ print(f"details_link: {details_link}")
112
+ try:
113
+ check_path = list(
114
+ API.list_files_info(
115
+ repo_id=f"open-llm-leaderboard/details_{details_model_name}",
116
+ paths="README.md",
117
+ repo_type="dataset",
118
+ )
119
+ )
120
+ print(f"check_path: {check_path}")
121
+ except Exception as err:
122
+ # No details repo for this model
123
+ print(f"No details repo for this model: {err}")
124
+ return model_hyperlink(link, model_name)
125
+
126
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
127
+
128
+
129
+ def styled_error(error):
130
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
131
+
132
+
133
+ def styled_warning(warn):
134
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
135
+
136
+
137
+ def styled_message(message):
138
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
139
+
140
+
141
+ def has_no_nan_values(df, columns):
142
+ return df[columns].notna().all(axis=1)
143
+
144
+
145
+ def has_nan_values(df, columns):
146
+ return df[columns].isna().any(axis=1)
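A minimal sketch of the display helpers in action (the model name is just an example):

```python
# Hedged usage sketch for the display helpers above.
from src.display_models.utils import make_clickable_model, styled_error, styled_message

print(make_clickable_model("tiiuae/falcon-7b"))  # HTML link, plus a 📑 details link when available
print(styled_message("Your request has been submitted to the evaluation queue!"))
print(styled_error("Model was not found on the hub!"))
```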
src/envs.py DELETED
@@ -1,46 +0,0 @@
1
- import os
2
- import logging
3
-
4
- from huggingface_hub import HfApi
5
-
6
- # clone / pull the lmeh eval data
7
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
8
-
9
- REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
10
- QUEUE_REPO = "open-llm-leaderboard/requests"
11
- DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
12
- RESULTS_REPO = "open-llm-leaderboard/results"
13
-
14
- PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
15
- PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
16
-
17
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
18
-
19
- HF_HOME = os.getenv("HF_HOME", ".")
20
-
21
- # Check HF_HOME write access
22
- print(f"Initial HF_HOME set to: {HF_HOME}")
23
-
24
- if not os.access(HF_HOME, os.W_OK):
25
- print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
26
- HF_HOME = "."
27
- os.environ["HF_HOME"] = HF_HOME
28
- else:
29
- print("Write access confirmed for HF_HOME")
30
-
31
- EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
32
- EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
33
- DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
34
- DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
35
-
36
- EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
37
- EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
38
-
39
- PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
40
-
41
- # Rate limit variables
42
- RATE_LIMIT_PERIOD = 7
43
- RATE_LIMIT_QUOTA = 5
44
- HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
45
-
46
- API = HfApi(token=H4_TOKEN)
src/leaderboard/filter_models.py DELETED
@@ -1,171 +0,0 @@
1
- from src.display.formatting import model_hyperlink
2
- from src.display.utils import AutoEvalColumn
3
-
4
-
5
- # Models which have been flagged by users as being problematic for a reason or another
6
- # (Model name to forum discussion link)
7
- FLAGGED_MODELS = {
8
- "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
9
- "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
10
- "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
11
- "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
12
- "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
13
- "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
14
- "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
15
- "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
16
- "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
17
- "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
18
- "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
19
- "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
20
- "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
21
- "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
22
- "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
23
- "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
24
- "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
25
- "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
26
- "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
27
- "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
28
- "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
29
- "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
30
- "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
31
- "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
32
- "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
33
- "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
34
- "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
35
- "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
36
- "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
37
- "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
38
- "cookinai/BruinHermes": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
39
- "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
40
- "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
41
- "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
42
- "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
43
- "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
44
- "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
45
- "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
46
- "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
47
- "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
48
- "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
49
- "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
50
- "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
51
- "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
52
- "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
53
- # Merges not indicated
54
- "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
55
- "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
56
- "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
57
- "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
58
- "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
59
- "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
60
- "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
61
- "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
62
- "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
63
- "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
64
- "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
65
- "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
66
- "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
67
- "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
68
- "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
69
- "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
70
- "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
71
- "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
72
- "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
73
- "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
74
- "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
75
- "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
76
- "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
77
- "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
78
- "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
79
- "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
80
- "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
81
- "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
82
- "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
83
- "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
84
- "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
85
- "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
86
- "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
87
- "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
88
- "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
89
- "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
90
- "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
91
- "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
92
- "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
93
- "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
94
- "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
95
- "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
96
- "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
97
- "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
98
- "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
99
- "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
100
- "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
101
- "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
102
- "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
103
- "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
104
- "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
105
- # MoErges
106
- "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
107
- "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
108
- "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
109
- "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
110
- "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
111
- "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
112
- "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
113
- "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
114
- "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
115
- "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
116
- "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
117
- # Other - contamination mostly
118
- "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
119
- "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
120
- "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
121
- "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
122
- }
123
-
124
- # Models which have been requested by orgs to not be submitted on the leaderboard
125
- DO_NOT_SUBMIT_MODELS = [
126
- "Voicelab/trurl-2-13b", # trained on MMLU
127
- "TigerResearch/tigerbot-70b-chat", # per authors request
128
- "TigerResearch/tigerbot-70b-chat-v2", # per authors request
129
- "TigerResearch/tigerbot-70b-chat-v4-4k", # per authors request
130
- ]
131
-
132
-
133
- def flag_models(leaderboard_data: list[dict]):
134
- """Flags models based on external criteria or flagged status."""
135
- for model_data in leaderboard_data:
136
- # Merges and moes are flagged automatically
137
- if model_data[AutoEvalColumn.flagged.name]:
138
- flag_key = "merged"
139
- else:
140
- flag_key = model_data[AutoEvalColumn.fullname.name]
141
- if flag_key in FLAGGED_MODELS:
142
- issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
143
- issue_link = model_hyperlink(
144
- FLAGGED_MODELS[flag_key],
145
- f"See discussion #{issue_num}",
146
- )
147
- model_data[AutoEvalColumn.model.name] = (
148
- f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
149
- )
150
- model_data[AutoEvalColumn.flagged.name] = True
151
- else:
152
- model_data[AutoEvalColumn.flagged.name] = False
153
-
154
-
155
- def remove_forbidden_models(leaderboard_data: list[dict]):
156
- """Removes models from the leaderboard based on the DO_NOT_SUBMIT list."""
157
- indices_to_remove = []
158
- for ix, model in enumerate(leaderboard_data):
159
- if model[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
160
- indices_to_remove.append(ix)
161
-
162
- # Remove the models from the list
163
- for ix in reversed(indices_to_remove):
164
- leaderboard_data.pop(ix)
165
- return leaderboard_data
166
-
167
-
168
- def filter_models_flags(leaderboard_data: list[dict]):
169
- leaderboard_data = remove_forbidden_models(leaderboard_data)
170
- flag_models(leaderboard_data)
171
-
src/leaderboard/read_evals.py DELETED
@@ -1,263 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from json import JSONDecodeError
4
- import logging
5
- import math
6
-
7
- from dataclasses import dataclass, field
8
- from typing import Optional, Dict, List
9
-
10
- from tqdm import tqdm
11
- from tqdm.contrib.logging import logging_redirect_tqdm
12
-
13
- import numpy as np
14
-
15
- from src.display.formatting import make_clickable_model
16
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
-
21
- @dataclass
22
- class EvalResult:
23
- # Also see src.display.utils.AutoEvalColumn for what will be displayed.
24
- eval_name: str # org_model_precision (uid)
25
- full_model: str # org/model (path on hub)
26
- org: Optional[str]
27
- model: str
28
- revision: str # commit hash, "" if main
29
- results: Dict[str, float]
30
- precision: Precision = Precision.Unknown
31
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
32
- weight_type: WeightType = WeightType.Original
33
- architecture: str = "Unknown" # From config file
34
- license: str = "?"
35
- likes: int = 0
36
- num_params: int = 0
37
- date: str = "" # submission date of request file
38
- still_on_hub: bool = True
39
- is_merge: bool = False
40
- flagged: bool = False
41
- status: str = "FINISHED"
42
- # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
43
- tags: List[str] = field(default_factory=list)
44
-
45
-
46
- @classmethod
47
- def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
48
- with open(json_filepath, 'r') as fp:
49
- data = json.load(fp)
50
-
51
- config = data.get("config_general", {})
52
- precision = Precision.from_str(config.get("model_dtype", "unknown"))
53
- org_and_model = config.get("model_name", "").split("/", 1)
54
- org = org_and_model[0] if len(org_and_model) > 1 else None
55
- model = org_and_model[-1]
56
- if len(org_and_model) == 1:
57
- org = None
58
- model = org_and_model[0]
59
- result_key = f"{model}_{precision.value.name}"
60
- else:
61
- org = org_and_model[0]
62
- model = org_and_model[1]
63
- result_key = f"{org}_{model}_{precision.value.name}"
64
- full_model = "/".join(org_and_model)
65
-
66
- results = cls.extract_results(data) # Properly call the method to extract results
67
-
68
- return cls(
69
- eval_name=result_key,
70
- full_model=full_model,
71
- org=org,
72
- model=model,
73
- results=results,
74
- precision=precision,
75
- revision=config.get("model_sha", "")
76
- )
77
-
78
- @staticmethod
79
- def extract_results(data: Dict) -> Dict[str, float]:
80
- """
81
- Extract and process benchmark results from a given dict.
82
-
83
- Parameters:
84
- - data (Dict): A dictionary containing benchmark data. This dictionary must
85
- include 'versions' and 'results' keys with respective sub-data.
86
-
87
- Returns:
88
- - Dict[str, float]: A dictionary where keys are benchmark names and values
89
- are the processed average scores as percentages.
90
-
91
- Notes:
92
- - The method specifically checks for certain benchmark names to skip outdated entries.
93
- - Handles NaN values by setting the corresponding benchmark result to 0.0.
94
- - Averages scores across metrics for benchmarks found in the data, in a percentage format.
95
- """
96
- results = {}
97
- for task in Tasks:
98
- task = task.value
99
- # We skip old mmlu entries
100
- if task.benchmark == "hendrycksTest":
101
- for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
102
- if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
103
- continue
104
-
105
- # Some benchamrk values are NaNs, mostly truthfulQA
106
- # Would be more optimal (without the whole dict itertion) if benchmark name was same as key in results
107
- # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
108
- for k, v in data["results"].items():
109
- if task.benchmark in k:
110
- if math.isnan(float(v[task.metric])):
111
- results[task.benchmark] = 0.0
112
- continue
113
-
114
- # We average all scores of a given metric (mostly for mmlu)
115
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
116
- if accs.size == 0 or any([acc is None for acc in accs]):
117
- continue
118
-
119
- mean_acc = np.mean(accs) * 100.0
120
- results[task.benchmark] = mean_acc
121
-
122
- return results
123
-
124
-
125
- def update_with_request_file(self, requests_path):
126
- """Finds the relevant request file for the current model and updates info with it."""
127
- try:
128
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
129
- if request_file is None:
130
- logging.warning(f"No request file for {self.org}/{self.model}")
131
- self.status = "FAILED"
132
- return
133
-
134
- with open(request_file, "r") as f:
135
- request = json.load(f)
136
-
137
- self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
138
- self.weight_type = WeightType[request.get("weight_type", "Original")]
139
- self.num_params = int(request.get("params", 0)) # Ensuring type safety
140
- self.date = request.get("submitted_time", "")
141
- self.architecture = request.get("architectures", "Unknown")
142
- self.status = request.get("status", "FAILED")
143
-
144
- except FileNotFoundError:
145
- self.status = "FAILED"
146
- logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
147
- except JSONDecodeError:
148
- self.status = "FAILED"
149
- logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
150
- except KeyError as e:
151
- self.status = "FAILED"
152
- logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
153
- except Exception as e: # Catch-all for any other unexpected exceptions
154
- self.status = "FAILED"
155
- logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
156
-
157
-
158
- def update_with_dynamic_file_dict(self, file_dict):
159
- """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
160
- # Default values set for optional or potentially missing keys.
161
- self.license = file_dict.get("license", "?")
162
- self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
163
- self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
164
- self.tags = file_dict.get("tags", [])
165
-
166
- # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
167
- self.flagged = "flagged" in self.tags
168
-
169
-
170
- def to_dict(self):
171
- """Converts the Eval Result to a dict compatible with our dataframe display"""
172
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
173
- data_dict = {
174
- "eval_name": self.eval_name, # not a column, just a save name,
175
- AutoEvalColumn.precision.name: self.precision.value.name,
176
- AutoEvalColumn.model_type.name: self.model_type.value.name,
177
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
178
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
179
- AutoEvalColumn.architecture.name: self.architecture,
180
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
181
- AutoEvalColumn.fullname.name: self.full_model,
182
- AutoEvalColumn.revision.name: self.revision,
183
- AutoEvalColumn.average.name: average,
184
- AutoEvalColumn.license.name: self.license,
185
- AutoEvalColumn.likes.name: self.likes,
186
- AutoEvalColumn.params.name: self.num_params,
187
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
188
- AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
189
- AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
190
- AutoEvalColumn.flagged.name: self.flagged,
191
- }
192
-
193
- for task in Tasks:
194
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
195
-
196
- return data_dict
197
-
198
-
199
- def get_request_file_for_model(requests_path, model_name, precision):
200
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
201
- requests_path = Path(requests_path)
202
- pattern = f"{model_name}_eval_request_*.json"
203
-
204
- # Using pathlib to find files matching the pattern
205
- request_files = list(requests_path.glob(pattern))
206
-
207
- # Sort the files by name in descending order to mimic 'reverse=True'
208
- request_files.sort(reverse=True)
209
-
210
- # Select the correct request file based on 'status' and 'precision'
211
- request_file = None
212
- for request_file in request_files:
213
- with request_file.open("r") as f:
214
- req_content = json.load(f)
215
- if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
216
- request_file = str(request_file)
217
-
218
- # Return empty string if no file found that matches criteria
219
- return request_file
220
-
221
-
222
- def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
223
- """From the path of the results folder root, extract all needed info for results"""
224
- with open(dynamic_path) as f:
225
- dynamic_data = json.load(f)
226
-
227
- results_path = Path(results_path)
228
- model_files = list(results_path.rglob('results_*.json'))
229
- model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
230
-
231
- eval_results = {}
232
- # Wrap model_files iteration with tqdm for progress display
233
- for model_result_filepath in tqdm(model_files, desc="Processing model files"):
234
- # Creation of result
235
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
236
- with logging_redirect_tqdm():
237
- eval_result.update_with_request_file(requests_path)
238
-
239
- if eval_result.full_model in dynamic_data:
240
- eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
241
- # Hardcoding because of gating problem
242
- if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
243
- eval_result.still_on_hub = True
244
-
245
- # Store results of same eval together
246
- eval_name = eval_result.eval_name
247
- if eval_name in eval_results.keys():
248
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
249
- else:
250
- eval_results[eval_name] = eval_result
251
-
252
- results = []
253
- for k, v in eval_results.items():
254
- try:
255
- if v.status == "FINISHED":
256
- v.to_dict() # we test if the dict version is complete
257
- results.append(v)
258
- except KeyError as e:
259
- logging.error(f"Error while checking model {k} {v.date} json, no key: {e}") # not all eval values present
260
- continue
261
-
262
- return results
263
-
src/load_from_hub.py ADDED
@@ -0,0 +1,152 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+ from huggingface_hub import Repository
6
+ from transformers import AutoConfig
7
+ from collections import defaultdict
8
+
9
+ from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
10
+ from src.display_models.get_model_metadata import apply_metadata
11
+ from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
12
+ from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
13
+
14
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
15
+
16
+
17
+ def get_all_requested_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
18
+ depth = 1
19
+ file_names = []
20
+ users_to_submission_dates = defaultdict(list)
21
+
22
+ for root, _, files in os.walk(requested_models_dir):
23
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
24
+ if current_depth == depth:
25
+ for file in files:
26
+ if not file.endswith(".json"):
27
+ continue
28
+ with open(os.path.join(root, file), "r") as f:
29
+ info = json.load(f)
30
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
31
+
32
+ # Select organisation
33
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
34
+ continue
35
+ organisation, _ = info["model"].split("/")
36
+ users_to_submission_dates[organisation].append(info["submitted_time"])
37
+
38
+ return set(file_names), users_to_submission_dates
39
+
40
+
41
+ def load_all_info_from_hub(QUEUE_REPO: str, RESULTS_REPO: str, QUEUE_PATH: str, RESULTS_PATH: str) -> tuple[Repository, set[str], Repository, dict[str, list[str]]]:
42
+ eval_queue_repo = None
43
+ eval_results_repo = None
44
+ requested_models = None
45
+
46
+ print("Pulling evaluation requests and results.")
47
+
48
+ eval_queue_repo = Repository(
49
+ local_dir=QUEUE_PATH,
50
+ clone_from=QUEUE_REPO,
51
+ repo_type="dataset",
52
+ )
53
+ eval_queue_repo.git_pull()
54
+
55
+ eval_results_repo = Repository(
56
+ local_dir=RESULTS_PATH,
57
+ clone_from=RESULTS_REPO,
58
+ repo_type="dataset",
59
+ )
60
+ eval_results_repo.git_pull()
61
+
62
+ requested_models, users_to_submission_dates = get_all_requested_models("eval-queue")
63
+
64
+ return eval_queue_repo, requested_models, eval_results_repo, users_to_submission_dates
65
+
66
+
67
+ def get_leaderboard_df(
68
+ eval_results: Repository, eval_results_private: Repository, cols: list, benchmark_cols: list
69
+ ) -> pd.DataFrame:
70
+ if eval_results:
71
+ print("Pulling evaluation results for the leaderboard.")
72
+ eval_results.git_pull()
73
+ if eval_results_private:
74
+ print("Pulling evaluation results for the leaderboard.")
75
+ eval_results_private.git_pull()
76
+
77
+ all_data = get_eval_results_dicts()
78
+
79
+ if not IS_PUBLIC:
80
+ all_data.append(gpt4_values)
81
+ all_data.append(gpt35_values)
82
+
83
+ all_data.append(baseline)
84
+ apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
85
+
86
+ df = pd.DataFrame.from_records(all_data)
87
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
88
+ df = df[cols].round(decimals=2)
89
+
90
+ # filter out if any of the benchmarks have not been produced
91
+ df = df[has_no_nan_values(df, benchmark_cols)]
92
+ return df
93
+
94
+
95
+ def get_evaluation_queue_df(
96
+ eval_queue: Repository, eval_queue_private: Repository, save_path: str, cols: list
97
+ ) -> list[pd.DataFrame]:
98
+ if eval_queue:
99
+ print("Pulling changes for the evaluation queue.")
100
+ eval_queue.git_pull()
101
+ if eval_queue_private:
102
+ print("Pulling changes for the evaluation queue.")
103
+ eval_queue_private.git_pull()
104
+
105
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
106
+ all_evals = []
107
+
108
+ for entry in entries:
109
+ if ".json" in entry:
110
+ file_path = os.path.join(save_path, entry)
111
+ with open(file_path) as fp:
112
+ data = json.load(fp)
113
+
114
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
115
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
116
+
117
+ all_evals.append(data)
118
+ elif ".md" not in entry:
119
+ # this is a folder
120
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
121
+ for sub_entry in sub_entries:
122
+ file_path = os.path.join(save_path, entry, sub_entry)
123
+ with open(file_path) as fp:
124
+ data = json.load(fp)
125
+
126
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
127
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
128
+ all_evals.append(data)
129
+
130
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
131
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
132
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
133
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
134
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
135
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
136
+ return df_finished[cols], df_running[cols], df_pending[cols]
137
+
138
+
139
+ def is_model_on_hub(model_name: str, revision: str) -> bool:
140
+ try:
141
+ AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
142
+ return True, None
143
+
144
+ except ValueError:
145
+ return (
146
+ False,
147
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
148
+ )
149
+
150
+ except Exception as e:
151
+ print(f"Could not get the model config from the hub.: {e}")
152
+ return False, "was not found on hub!"
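
For context, a minimal sketch of the hub-availability check reintroduced above (the model id in the commented example is a placeholder; the call needs network access and `transformers` installed):

```python
from transformers import AutoConfig

def can_auto_submit(model_name: str, revision: str = "main") -> tuple[bool, str]:
    # Mirrors the logic above: models that require trust_remote_code are rejected,
    # anything else that fails to load is reported as not found on the hub.
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
        return True, ""
    except ValueError:
        return False, "needs to be launched with `trust_remote_code=True`"
    except Exception as exc:
        return False, f"was not found on the hub ({exc})"

# Example (requires network access):
# print(can_auto_submit("gpt2"))
```
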
src/populate.py DELETED
@@ -1,54 +0,0 @@
1
- import json
2
- import os
3
- import pathlib
4
- import pandas as pd
5
- from src.display.formatting import has_no_nan_values, make_clickable_model
6
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
7
- from src.leaderboard.filter_models import filter_models_flags
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
- from src.display.utils import load_json_data
10
-
11
-
12
- def _process_model_data(entry, model_name_key="model", revision_key="revision"):
13
- """Enrich model data with clickable links and revisions."""
14
- entry[EvalQueueColumn.model.name] = make_clickable_model(entry.get(model_name_key, ""))
15
- entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
16
- return entry
17
-
18
-
19
- def get_evaluation_queue_df(save_path, cols):
20
- """Generate dataframes for pending, running, and finished evaluation entries."""
21
- save_path = pathlib.Path(save_path)
22
- all_evals = []
23
-
24
- for path in save_path.rglob('*.json'):
25
- data = load_json_data(path)
26
- if data:
27
- all_evals.append(_process_model_data(data))
28
-
29
- # Organizing data by status
30
- status_map = {
31
- "PENDING": ["PENDING", "RERUN"],
32
- "RUNNING": ["RUNNING"],
33
- "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
34
- }
35
- status_dfs = {status: [] for status in status_map}
36
- for eval_data in all_evals:
37
- for status, extra_statuses in status_map.items():
38
- if eval_data["status"] in extra_statuses:
39
- status_dfs[status].append(eval_data)
40
-
41
- return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
42
-
43
-
44
- def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
45
- """Retrieve and process leaderboard data."""
46
- raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)
47
- all_data_json = [model.to_dict() for model in raw_data] + [baseline_row]
48
- filter_models_flags(all_data_json)
49
-
50
- df = pd.DataFrame.from_records(all_data_json)
51
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
52
- df = df[cols].round(decimals=2)
53
- df = df[has_no_nan_values(df, benchmark_cols)]
54
- return raw_data, df
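
The status-bucketing idea in the removed `get_evaluation_queue_df` can be shown standalone; a small self-contained sketch with made-up entries:

```python
import pandas as pd

# Made-up queue entries, just to illustrate the grouping.
entries = [
    {"model": "org/model-a", "status": "PENDING"},
    {"model": "org/model-b", "status": "RUNNING"},
    {"model": "org/model-c", "status": "FINISHED"},
]
status_map = {
    "PENDING": ["PENDING", "RERUN"],
    "RUNNING": ["RUNNING"],
    "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
}
cols = ["model", "status"]
queue_dfs = {
    bucket: pd.DataFrame([e for e in entries if e["status"] in statuses], columns=cols)
    for bucket, statuses in status_map.items()
}
print(queue_dfs["PENDING"])  # one row: org/model-a
```
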
src/rate_limiting.py ADDED
@@ -0,0 +1,13 @@
1
+ from datetime import datetime, timezone, timedelta
2
+
3
+
4
+ def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
5
+ org_or_user, _ = submission_name.split("/")
6
+ if org_or_user not in users_to_submission_dates:
7
+ return 0
8
+ submission_dates = sorted(users_to_submission_dates[org_or_user])
9
+
10
+ time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
11
+ submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
12
+
13
+ return len(submissions_after_timelimit)
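
The count added in `src/rate_limiting.py` works because `"YYYY-MM-DDTHH:MM:SSZ"` strings sort chronologically, so a plain string comparison against the cutoff is enough; a minimal standalone sketch:

```python
from datetime import datetime, timedelta, timezone

def count_recent_submissions(submission_dates: list[str], rate_limit_period: int) -> int:
    # ISO-8601 timestamps compare chronologically as plain strings,
    # so "d > cutoff" keeps only submissions inside the rolling window.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
    return len([d for d in submission_dates if d > cutoff])

# Example with made-up dates (the far-future one is always inside the window):
print(count_recent_submissions(["2023-09-01T00:00:00Z", "2099-01-01T00:00:00Z"], 7))  # -> 1
```
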
src/scripts/create_request_file.py DELETED
@@ -1,92 +0,0 @@
1
- import json
2
- import os
3
- import pprint
4
- from datetime import datetime, timezone
5
-
6
- import click
7
- from colorama import Fore
8
- from huggingface_hub import HfApi, snapshot_download
9
-
10
- from src.display.utils import ModelType, WeightType
11
- from src.submission.check_validity import get_model_size
12
-
13
- EVAL_REQUESTS_PATH = "eval-queue"
14
- QUEUE_REPO = "open-llm-leaderboard/requests"
15
-
16
- precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
17
- model_types = [e.name for e in ModelType]
18
- weight_types = [e.name for e in WeightType]
19
-
20
-
21
- def main():
22
- api = HfApi()
23
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
24
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
25
-
26
- model_name = click.prompt("Enter model name")
27
- revision = click.prompt("Enter revision", default="main")
28
- precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
29
- model_type = click.prompt("Enter model type", type=click.Choice(model_types))
30
- weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
31
- base_model = click.prompt("Enter base model", default="")
32
- status = click.prompt("Enter status", default="FINISHED")
33
-
34
- try:
35
- model_info = api.model_info(repo_id=model_name, revision=revision)
36
- except Exception as e:
37
- print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
38
- return 1
39
-
40
- model_size = get_model_size(model_info=model_info, precision=precision)
41
-
42
- try:
43
- license = model_info.cardData["license"]
44
- except Exception:
45
- license = "?"
46
-
47
- eval_entry = {
48
- "model": model_name,
49
- "base_model": base_model,
50
- "revision": revision,
51
- "private": False,
52
- "precision": precision,
53
- "weight_type": weight_type,
54
- "status": status,
55
- "submitted_time": current_time,
56
- "model_type": model_type,
57
- "likes": model_info.likes,
58
- "params": model_size,
59
- "license": license,
60
- }
61
-
62
- user_name = ""
63
- model_path = model_name
64
- if "/" in model_name:
65
- user_name = model_name.split("/")[0]
66
- model_path = model_name.split("/")[1]
67
-
68
- pprint.pprint(eval_entry)
69
-
70
- if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
71
- click.echo("continuing...")
72
-
73
- out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
74
- os.makedirs(out_dir, exist_ok=True)
75
- out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
76
-
77
- with open(out_path, "w") as f:
78
- f.write(json.dumps(eval_entry))
79
-
80
- api.upload_file(
81
- path_or_fileobj=out_path,
82
- path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
83
- repo_id=QUEUE_REPO,
84
- repo_type="dataset",
85
- commit_message=f"Add {model_name} to eval queue",
86
- )
87
- else:
88
- click.echo("aborting...")
89
-
90
-
91
- if __name__ == "__main__":
92
- main()
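
For reference, the removed script wrote request files under a fixed naming scheme; a sketch with placeholder values (model id and metadata are made up, no upload is performed):

```python
import json
from datetime import datetime, timezone

model = "my-org/my-model"  # placeholder model id
precision = "float16"
weight_type = "Original"
user_name, model_path = model.split("/")

eval_entry = {
    "model": model,
    "base_model": "",
    "revision": "main",
    "private": False,
    "precision": precision,
    "weight_type": weight_type,
    "status": "PENDING",
    "submitted_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
}

# Request files live under eval-queue/<organisation>/ and encode privacy, precision and weight type.
out_path = f"eval-queue/{user_name}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
print(out_path)  # eval-queue/my-org/my-model_eval_request_False_float16_Original.json
print(json.dumps(eval_entry, indent=2))
```
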
src/scripts/update_all_request_files.py DELETED
@@ -1,129 +0,0 @@
1
- import json
2
- import os
3
- import time
4
-
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
8
- from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
9
-
10
-
11
- def update_one_model(model_id, data, models_on_the_hub):
12
- # Model no longer on the hub at all
13
- if model_id not in models_on_the_hub:
14
- data["still_on_hub"] = False
15
- data["likes"] = 0
16
- data["downloads"] = 0
17
- data["created_at"] = ""
18
- data["tags"] = []
19
- return data
20
-
21
- # Grabbing model parameters
22
- model_cfg = models_on_the_hub[model_id]
23
- data["likes"] = model_cfg.likes
24
- data["downloads"] = model_cfg.downloads
25
- data["created_at"] = str(model_cfg.created_at)
26
- data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
27
-
28
- # Grabbing model details
29
- model_name = model_id
30
- if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
31
- if isinstance(model_cfg.card_data.base_model, str):
32
- model_name = model_cfg.card_data.base_model # for adapters, we look at the parent model
33
- still_on_hub, _, _ = is_model_on_hub(
34
- model_name=model_name,
35
- revision=data.get("revision"),
36
- trust_remote_code=True,
37
- test_tokenizer=False,
38
- token=H4_TOKEN,
39
- )
40
- # If the model doesn't have a model card or a license, we consider it's deleted
41
- if still_on_hub:
42
- try:
43
- status, _, model_card = check_model_card(model_id)
44
- if status is False:
45
- still_on_hub = False
46
- except Exception:
47
- model_card = None
48
- still_on_hub = False
49
- data["still_on_hub"] = still_on_hub
50
-
51
- tags = get_model_tags(model_card, model_id) if still_on_hub else []
52
-
53
- data["tags"] = tags
54
- return data
55
-
56
-
57
- def update_models(file_path, models_on_the_hub):
58
- """
59
- Search through all JSON files in the specified root folder and its subfolders,
60
- and update the likes key in JSON dict from value of input dict
61
- """
62
- seen_models = []
63
- with open(file_path, "r") as f:
64
- model_infos = json.load(f)
65
- for model_id in model_infos.keys():
66
- seen_models.append(model_id)
67
- model_infos[model_id] = update_one_model(
68
- model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
69
- )
70
-
71
- # If new requests files have been created since we started all this
72
- # we grab them
73
- all_models = []
74
- try:
75
- for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
76
- if ix == 0:
77
- continue
78
- for file in files:
79
- if "eval_request" in file:
80
- path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
81
- all_models.append(path)
82
- except Exception as e:
83
- print(e)
84
- pass
85
-
86
- for model_id in all_models:
87
- if model_id not in seen_models:
88
- model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
89
-
90
- with open(file_path, "w") as f:
91
- json.dump(model_infos, f, indent=2)
92
-
93
-
94
- def update_dynamic_files():
95
- """This will only update metadata for models already linked in the repo, not add missing ones."""
96
- snapshot_download(
97
- repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
98
- )
99
-
100
- print("UPDATE_DYNAMIC: Loaded snapshot")
101
- # Get models
102
- start = time.time()
103
-
104
- models = list(
105
- API.list_models(
106
- # filter=ModelFilter(task="text-generation"),
107
- full=False,
108
- cardData=True,
109
- fetch_config=True,
110
- )
111
- )
112
- id_to_model = {model.id: model for model in models}
113
-
114
- print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
115
-
116
- start = time.time()
117
-
118
- update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
119
-
120
- print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
121
-
122
- API.upload_file(
123
- path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
124
- path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
125
- repo_id=DYNAMIC_INFO_REPO,
126
- repo_type="dataset",
127
- commit_message="Daily request file update.",
128
- )
129
- print("UPDATE_DYNAMIC: pushed to hub")
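
The removed updater also rediscovered request files on disk and mapped them back to `"org/model"` ids; a small sketch of that walk (assuming the `eval-queue/<organisation>/<file>.json` layout used above):

```python
import os

def collect_requested_models(requests_dir: str) -> list[str]:
    # Walk organisation sub-folders and rebuild "org/model" ids from request file names.
    models = []
    for ix, (root, _, files) in enumerate(os.walk(requests_dir)):
        if ix == 0:
            continue  # skip the top-level folder itself
        for file in files:
            if "eval_request" in file:
                models.append(os.path.basename(root) + "/" + file.split("_eval_request")[0])
    return models
```
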
src/submission/check_validity.py DELETED
@@ -1,179 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
10
- from transformers import AutoConfig, AutoTokenizer
11
-
12
- from src.envs import HAS_HIGHER_RATE_LIMIT
13
-
14
-
15
- # ht to @Wauplin, thank you for the snippet!
16
- # See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
17
- def check_model_card(repo_id: str) -> tuple[bool, str]:
18
- # Returns operation status, and error message
19
- try:
20
- card = ModelCard.load(repo_id)
21
- except huggingface_hub.utils.EntryNotFoundError:
22
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
23
-
24
- # Enforce license metadata
25
- if card.data.license is None:
26
- if not ("license_name" in card.data and "license_link" in card.data):
27
- return (
28
- False,
29
- (
30
- "License not found. Please add a license to your model card using the `license` metadata or a"
31
- " `license_name`/`license_link` pair."
32
- ),
33
- None,
34
- )
35
-
36
- # Enforce card content
37
- if len(card.text) < 200:
38
- return False, "Please add a description to your model card, it is too short.", None
39
-
40
- return True, "", card
41
-
42
-
43
- def is_model_on_hub(
44
- model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
45
- ) -> tuple[bool, str, AutoConfig]:
46
- try:
47
- config = AutoConfig.from_pretrained(
48
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
- ) # , force_download=True)
50
- if test_tokenizer:
51
- try:
52
- tk = AutoTokenizer.from_pretrained(
53
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
- )
55
- except ValueError as e:
56
- return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
57
- except Exception:
58
- return (
59
- False,
60
- "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
61
- None,
62
- )
63
- return True, None, config
64
-
65
- except ValueError:
66
- return (
67
- False,
68
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
69
- None,
70
- )
71
-
72
- except Exception as e:
73
- if "You are trying to access a gated repo." in str(e):
74
- return True, "uses a gated model.", None
75
- return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
76
-
77
-
78
- def get_model_size(model_info: ModelInfo, precision: str):
79
- size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
80
- safetensors = None
81
- try:
82
- safetensors = get_safetensors_metadata(model_info.id)
83
- except Exception as e:
84
- print(e)
85
-
86
- if safetensors is not None:
87
- model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
88
- else:
89
- try:
90
- size_match = re.search(size_pattern, model_info.id.lower())
91
- model_size = size_match.group(0)
92
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
93
- except AttributeError:
94
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
95
-
96
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
97
- model_size = size_factor * model_size
98
- return model_size
99
-
100
-
101
- def get_model_arch(model_info: ModelInfo):
102
- return model_info.config.get("architectures", "Unknown")
103
-
104
-
105
- def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
106
- if org_or_user not in users_to_submission_dates:
107
- return True, ""
108
- submission_dates = sorted(users_to_submission_dates[org_or_user])
109
-
110
- time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
111
- submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
112
-
113
- num_models_submitted_in_period = len(submissions_after_timelimit)
114
- if org_or_user in HAS_HIGHER_RATE_LIMIT:
115
- rate_limit_quota = 2 * rate_limit_quota
116
-
117
- if num_models_submitted_in_period > rate_limit_quota:
118
- error_msg = f"Organisation or user `{org_or_user}`"
119
- error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
120
- error_msg += f"in the last {rate_limit_period} days.\n"
121
- error_msg += (
122
- "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
123
- )
124
- return False, error_msg
125
- return True, ""
126
-
127
-
128
- def already_submitted_models(requested_models_dir: str) -> set[str]:
129
- depth = 1
130
- file_names = []
131
- users_to_submission_dates = defaultdict(list)
132
-
133
- for root, _, files in os.walk(requested_models_dir):
134
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
135
- if current_depth == depth:
136
- for file in files:
137
- if not file.endswith(".json"):
138
- continue
139
- with open(os.path.join(root, file), "r") as f:
140
- info = json.load(f)
141
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
142
-
143
- # Select organisation
144
- if info["model"].count("/") == 0 or "submitted_time" not in info:
145
- continue
146
- organisation, _ = info["model"].split("/")
147
- users_to_submission_dates[organisation].append(info["submitted_time"])
148
-
149
- return set(file_names), users_to_submission_dates
150
-
151
-
152
- def get_model_tags(model_card, model: str):
153
- is_merge_from_metadata = False
154
- is_moe_from_metadata = False
155
-
156
- tags = []
157
- if model_card is None:
158
- return tags
159
- if model_card.data.tags:
160
- is_merge_from_metadata = any(
161
- [tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]]
162
- )
163
- is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
164
-
165
- is_merge_from_model_card = any(
166
- keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
167
- )
168
- if is_merge_from_model_card or is_merge_from_metadata:
169
- tags.append("merge")
170
- is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
171
- # Hardcoding because of gating problem
172
- if "Qwen/Qwen1.5-32B" in model:
173
- print("HERE NSHJNKJSNJLAS")
174
- is_moe_from_model_card = False
175
- is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
176
- if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
177
- tags.append("moe")
178
-
179
- return tags
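
When safetensors metadata is unavailable, the removed `get_model_size` falls back to parsing a size token out of the repo name; a self-contained sketch of that fallback (model ids below are illustrative, not real repos):

```python
import re

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

def size_from_name(model_id: str) -> float:
    # Parse "7b" / "350m" style tokens from the model id; unknown sizes are reported as 0.
    match = size_pattern.search(model_id.lower())
    if match is None:
        return 0
    raw = match.group(0)
    value = float(raw[:-1])
    return round(value if raw.endswith("b") else value / 1e3, 3)

print(size_from_name("my-org/cool-model-7b"))    # 7.0
print(size_from_name("my-org/tiny-model-350m"))  # 0.35
```
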
src/submission/submit.py DELETED
@@ -1,196 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.display.formatting import styled_error, styled_message, styled_warning
8
- from src.envs import (
9
- API,
10
- DYNAMIC_INFO_FILE_PATH,
11
- DYNAMIC_INFO_PATH,
12
- DYNAMIC_INFO_REPO,
13
- EVAL_REQUESTS_PATH,
14
- H4_TOKEN,
15
- QUEUE_REPO,
16
- RATE_LIMIT_PERIOD,
17
- RATE_LIMIT_QUOTA,
18
- )
19
- from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
20
- from src.submission.check_validity import (
21
- already_submitted_models,
22
- check_model_card,
23
- get_model_size,
24
- get_model_tags,
25
- is_model_on_hub,
26
- user_submission_permission,
27
- )
28
-
29
- REQUESTED_MODELS = None
30
- USERS_TO_SUBMISSION_DATES = None
31
-
32
-
33
- def add_new_eval(
34
- model: str,
35
- base_model: str,
36
- revision: str,
37
- precision: str,
38
- private: bool,
39
- weight_type: str,
40
- model_type: str,
41
- ):
42
- global REQUESTED_MODELS
43
- global USERS_TO_SUBMISSION_DATES
44
- if not REQUESTED_MODELS:
45
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
46
-
47
- user_name = ""
48
- model_path = model
49
- if "/" in model:
50
- user_name = model.split("/")[0]
51
- model_path = model.split("/")[1]
52
-
53
- precision = precision.split(" ")[0]
54
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
55
-
56
- if model_type is None or model_type == "":
57
- return styled_error("Please select a model type.")
58
-
59
- # Is the user rate limited?
60
- if user_name != "":
61
- user_can_submit, error_msg = user_submission_permission(
62
- user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
63
- )
64
- if not user_can_submit:
65
- return styled_error(error_msg)
66
-
67
- # Did the model authors forbid its submission to the leaderboard?
68
- if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
69
- return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
70
-
71
- if model == "CohereForAI/c4ai-command-r-plus":
72
- return styled_warning(
73
- "This model cannot be submitted manually on the leaderboard before the transformers release."
74
- )
75
-
76
- # Does the model actually exist?
77
- if revision == "":
78
- revision = "main"
79
-
80
- # Is the model on the hub?
81
- if weight_type in ["Delta", "Adapter"]:
82
- base_model_on_hub, error, _ = is_model_on_hub(
83
- model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
84
- )
85
- if not base_model_on_hub:
86
- return styled_error(f'Base model "{base_model}" {error}')
87
-
88
- architecture = "?"
89
- downloads = 0
90
- created_at = ""
91
- if not weight_type == "Adapter":
92
- model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
93
- if not model_on_hub or model_config is None:
94
- return styled_error(f'Model "{model}" {error}')
95
- if model_config is not None:
96
- architectures = getattr(model_config, "architectures", None)
97
- if architectures:
98
- architecture = ";".join(architectures)
99
- downloads = getattr(model_config, "downloads", 0)
100
- created_at = getattr(model_config, "created_at", "")
101
-
102
- # Is the model info correctly filled?
103
- try:
104
- model_info = API.model_info(repo_id=model, revision=revision)
105
- except Exception:
106
- return styled_error("Could not get your model information. Please fill it up properly.")
107
-
108
- model_size = get_model_size(model_info=model_info, precision=precision)
109
-
110
- # Were the model card and license filled?
111
- try:
112
- license = model_info.cardData["license"]
113
- except Exception:
114
- return styled_error("Please select a license for your model")
115
-
116
- modelcard_OK, error_msg, model_card = check_model_card(model)
117
- if not modelcard_OK:
118
- return styled_error(error_msg)
119
-
120
- tags = get_model_tags(model_card, model)
121
-
122
- # Seems good, creating the eval
123
- print("Adding new eval")
124
-
125
- eval_entry = {
126
- "model": model,
127
- "base_model": base_model,
128
- "revision": revision,
129
- "private": private,
130
- "precision": precision,
131
- "params": model_size,
132
- "architectures": architecture,
133
- "weight_type": weight_type,
134
- "status": "PENDING",
135
- "submitted_time": current_time,
136
- "model_type": model_type,
137
- "job_id": -1,
138
- "job_start_time": None,
139
- }
140
-
141
- supplementary_info = {
142
- "likes": model_info.likes,
143
- "license": license,
144
- "still_on_hub": True,
145
- "tags": tags,
146
- "downloads": downloads,
147
- "created_at": created_at,
148
- }
149
-
150
- # Check for duplicate submission
151
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
152
- return styled_warning("This model has been already submitted.")
153
-
154
- print("Creating eval file")
155
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
156
- os.makedirs(OUT_DIR, exist_ok=True)
157
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
158
-
159
- with open(out_path, "w") as f:
160
- f.write(json.dumps(eval_entry))
161
-
162
- print("Uploading eval file")
163
- API.upload_file(
164
- path_or_fileobj=out_path,
165
- path_in_repo=out_path.split("eval-queue/")[1],
166
- repo_id=QUEUE_REPO,
167
- repo_type="dataset",
168
- commit_message=f"Add {model} to eval queue",
169
- )
170
-
171
- # We want to grab the latest version of the submission file to not accidentally overwrite it
172
- snapshot_download(
173
- repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
174
- )
175
-
176
- with open(DYNAMIC_INFO_FILE_PATH) as f:
177
- all_supplementary_info = json.load(f)
178
-
179
- all_supplementary_info[model] = supplementary_info
180
- with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
181
- json.dump(all_supplementary_info, f, indent=2)
182
-
183
- API.upload_file(
184
- path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
185
- path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
186
- repo_id=DYNAMIC_INFO_REPO,
187
- repo_type="dataset",
188
- commit_message=f"Add {model} to dynamic info queue",
189
- )
190
-
191
- # Remove the local file
192
- os.remove(out_path)
193
-
194
- return styled_message(
195
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
196
- )
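
The duplicate-submission guard in the removed `add_new_eval` keys the queue on model, revision and precision; a tiny sketch (the previously submitted key below is hypothetical):

```python
# Normally built from the request files in the queue repo; hard-coded here for illustration.
requested_models = {"my-org/my-model_main_float16"}

model, revision, precision = "my-org/my-model", "main", "float16"
if f"{model}_{revision}_{precision}" in requested_models:
    print("This model has been already submitted.")
else:
    print("New submission, writing the request file.")
```
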
src/tools/collections.py DELETED
@@ -1,76 +0,0 @@
1
- import pandas as pd
2
- from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
3
- from huggingface_hub.utils._errors import HfHubHTTPError
4
- from pandas import DataFrame
5
-
6
- from src.display.utils import AutoEvalColumn, ModelType
7
- from src.envs import H4_TOKEN, PATH_TO_COLLECTION
8
-
9
- # Specific intervals for the collections
10
- intervals = {
11
- "1B": pd.Interval(0, 1.5, closed="right"),
12
- "3B": pd.Interval(2.5, 3.5, closed="neither"),
13
- "7B": pd.Interval(6, 8, closed="neither"),
14
- "13B": pd.Interval(10, 14, closed="neither"),
15
- "30B": pd.Interval(25, 35, closed="neither"),
16
- "65B": pd.Interval(60, 70, closed="neither"),
17
- }
18
-
19
-
20
- def _filter_by_type_and_size(df, model_type, size_interval):
21
- """Filter DataFrame by model type and parameter size interval."""
22
- type_emoji = model_type.value.symbol[0]
23
- filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
24
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
25
- mask = params_column.apply(lambda x: x in size_interval)
26
- return filtered_df.loc[mask]
27
-
28
-
29
- def _add_models_to_collection(collection, models, model_type, size):
30
- """Add best models to the collection and update positions."""
31
- cur_len_collection = len(collection.items)
32
- for ix, model in enumerate(models, start=1):
33
- try:
34
- collection = add_collection_item(
35
- PATH_TO_COLLECTION,
36
- item_id=model,
37
- item_type="model",
38
- exists_ok=True,
39
- note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
40
- token=H4_TOKEN,
41
- )
42
- # Ensure position is correct if item was added
43
- if len(collection.items) > cur_len_collection:
44
- item_object_id = collection.items[-1].item_object_id
45
- update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
46
- cur_len_collection = len(collection.items)
47
- break # assuming we only add the top model
48
- except HfHubHTTPError:
49
- continue
50
-
51
-
52
- def update_collections(df: DataFrame):
53
- """Update collections by filtering and adding the best models."""
54
- collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
55
- cur_best_models = []
56
-
57
- for model_type in ModelType:
58
- if not model_type.value.name:
59
- continue
60
- for size, interval in intervals.items():
61
- filtered_df = _filter_by_type_and_size(df, model_type, interval)
62
- best_models = list(
63
- filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
64
- )
65
- print(model_type.value.symbol, size, best_models)
66
- _add_models_to_collection(collection, best_models, model_type, size)
67
- cur_best_models.extend(best_models)
68
-
69
- # Cleanup
70
- existing_models = {item.item_id for item in collection.items}
71
- to_remove = existing_models - set(cur_best_models)
72
- for item_id in to_remove:
73
- try:
74
- delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
75
- except HfHubHTTPError:
76
- continue
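
The removed collections helper buckets models by parameter count using `pandas.Interval` membership; a short sketch with made-up sizes:

```python
import pandas as pd

intervals = {
    "7B": pd.Interval(6, 8, closed="neither"),
    "13B": pd.Interval(10, 14, closed="neither"),
}
params = pd.Series([6.74, 13.0, 34.0])  # made-up parameter counts in billions

for label, interval in intervals.items():
    # pd.Interval supports the "in" operator, which is what the size filter relies on.
    in_bucket = params.apply(lambda x: x in interval)
    print(label, params[in_bucket].tolist())
```
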
src/tools/plots.py DELETED
@@ -1,159 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- import plotly.express as px
4
- from plotly.graph_objs import Figure
5
-
6
- from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
7
- from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
- from src.leaderboard.filter_models import FLAGGED_MODELS
9
- from src.leaderboard.read_evals import EvalResult
10
-
11
-
12
- def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
13
- """
14
- Generates a DataFrame containing the maximum scores until each date.
15
-
16
- :param results_df: A DataFrame containing result information including metric scores and dates.
17
- :return: A new DataFrame containing the maximum scores until each date for every metric.
18
- """
19
- # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
- results_df = pd.DataFrame(raw_data)
21
- # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
- results_df.sort_values(by="date", inplace=True)
23
-
24
- # Step 2: Initialize the scores dictionary
25
- scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
26
-
27
- # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
28
- for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
29
- current_max = 0
30
- last_date = ""
31
- column = task.col_name
32
- for _, row in results_df.iterrows():
33
- current_model = row["full_model"]
34
- # We ignore models that are flagged/no longer on the hub/not finished
35
- to_ignore = (
36
- not row["still_on_hub"]
37
- or row["flagged"]
38
- or current_model in FLAGGED_MODELS
39
- or row["status"] != "FINISHED"
40
- )
41
- if to_ignore:
42
- continue
43
-
44
- current_date = row["date"]
45
- if task.benchmark == "Average":
46
- current_score = np.mean(list(row["results"].values()))
47
- else:
48
- current_score = row["results"][task.benchmark]
49
-
50
- if current_score > current_max:
51
- if current_date == last_date and len(scores[column]) > 0:
52
- scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
53
- else:
54
- scores[column].append({"model": current_model, "date": current_date, "score": current_score})
55
- current_max = current_score
56
- last_date = current_date
57
-
58
- # Step 4: Return all dictionaries as DataFrames
59
- return {k: pd.DataFrame(v) for k, v in scores.items()}
60
-
61
-
62
- def create_plot_df(scores_df: dict[str : pd.DataFrame]) -> pd.DataFrame:
63
- """
64
- Transforms the scores DataFrame into a new format suitable for plotting.
65
-
66
- :param scores_df: A DataFrame containing metric scores and dates.
67
- :return: A new DataFrame reshaped for plotting purposes.
68
- """
69
- # Initialize the list to store DataFrames
70
- dfs = []
71
-
72
- # Iterate over the cols and create a new DataFrame for each column
73
- for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
74
- d = scores_df[col].reset_index(drop=True)
75
- d["task"] = col
76
- dfs.append(d)
77
-
78
- # Concatenate all the created DataFrames
79
- concat_df = pd.concat(dfs, ignore_index=True)
80
-
81
- # Sort values by 'date'
82
- concat_df.sort_values(by="date", inplace=True)
83
- concat_df.reset_index(drop=True, inplace=True)
84
- return concat_df
85
-
86
-
87
- def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
88
- """
89
- Create a Plotly figure object with lines representing different metrics
90
- and horizontal dotted lines representing human baselines.
91
-
92
- :param df: The DataFrame containing the metric values, names, and dates.
93
- :param metrics: A list of strings representing the names of the metrics
94
- to be included in the plot.
95
- :param title: A string representing the title of the plot.
96
- :return: A Plotly figure object with lines representing metrics and
97
- horizontal dotted lines representing human baselines.
98
- """
99
-
100
- # Filter the DataFrame based on the specified metrics
101
- df = df[df["task"].isin(metrics)]
102
-
103
- # Filter the human baselines based on the specified metrics
104
- filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
105
-
106
- # Create a line figure using plotly express with specified markers and custom data
107
- fig = px.line(
108
- df,
109
- x="date",
110
- y="score",
111
- color="task",
112
- markers=True,
113
- custom_data=["task", "score", "model"],
114
- title=title,
115
- )
116
-
117
- # Update hovertemplate for better hover interaction experience
118
- fig.update_traces(
119
- hovertemplate="<br>".join(
120
- [
121
- "Model Name: %{customdata[2]}",
122
- "Metric Name: %{customdata[0]}",
123
- "Date: %{x}",
124
- "Metric Value: %{y}",
125
- ]
126
- )
127
- )
128
-
129
- # Update the range of the y-axis
130
- fig.update_layout(yaxis_range=[0, 100])
131
-
132
- # Create a dictionary to hold the color mapping for each metric
133
- metric_color_mapping = {}
134
-
135
- # Map each metric name to its color in the figure
136
- for trace in fig.data:
137
- metric_color_mapping[trace.name] = trace.line.color
138
-
139
- # Iterate over filtered human baselines and add horizontal lines to the figure
140
- for metric, value in filtered_human_baselines.items():
141
- color = metric_color_mapping.get(metric, "blue") # Retrieve color from mapping; default to blue if not found
142
- location = "top left" if metric == "HellaSwag" else "bottom left" # Set annotation position
143
- # Add horizontal line with matched color and positioned annotation
144
- fig.add_hline(
145
- y=value,
146
- line_dash="dot",
147
- annotation_text=f"{metric} human baseline",
148
- annotation_position=location,
149
- annotation_font_size=10,
150
- annotation_font_color=color,
151
- line_color=color,
152
- )
153
-
154
- return fig
155
-
156
-
157
- # Example Usage:
158
- # human_baselines dictionary is defined.
159
- # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
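
The core of the removed `create_scores_df` is a running maximum over date-sorted results; a minimal sketch with made-up scores:

```python
import pandas as pd

results = pd.DataFrame(
    {
        "date": ["2023-07-01", "2023-07-15", "2023-08-01"],
        "model": ["model-a", "model-b", "model-c"],
        "score": [55.0, 52.0, 61.0],
    }
).sort_values("date")

best, current_max = [], 0.0
for _, row in results.iterrows():
    # Keep only rows that improve on the best score seen so far.
    if row["score"] > current_max:
        current_max = row["score"]
        best.append({"date": row["date"], "model": row["model"], "score": row["score"]})

print(pd.DataFrame(best))  # model-a then model-c; model-b never led
```
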
update_dynamic.py DELETED
@@ -1,4 +0,0 @@
1
- from src.scripts.update_all_request_files import update_dynamic_files
2
-
3
- if __name__ == "__main__":
4
- update_dynamic_files()
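
With the standalone entry point removed, a recurring refresh would have to be wired elsewhere; purely hypothetical wiring using the `BackgroundScheduler` already imported in `app.py` (job body and interval are placeholders):

```python
from apscheduler.schedulers.background import BackgroundScheduler

def daily_job():
    # Placeholder for a periodic refresh task.
    print("refresh dynamic request files")

scheduler = BackgroundScheduler()
scheduler.add_job(daily_job, "interval", hours=24)
scheduler.start()  # runs in the background alongside the Gradio app
```
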