.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
- gif.gif filter=lfs diff=lfs merge=lfs -text
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 
.gitignore CHANGED
@@ -1,22 +1,15 @@
 
1
  venv/
2
- .venv/
3
  __pycache__/
4
  .env
5
  .ipynb_checkpoints
6
  *ipynb
7
  .vscode/
8
- .DS_Store
9
- .ruff_cache/
10
- .python-version
11
- .profile_app.python
12
- *pstats
13
- *.lock
14
 
 
 
15
  eval-queue/
16
  eval-results/
17
- dynamic-info/
18
- downloads/
19
- model-votes/
20
- open-llm-leaderboard___contents/
21
 
22
  src/assets/model_counts.html
 
1
+ auto_evals/
2
  venv/
 
3
  __pycache__/
4
  .env
5
  .ipynb_checkpoints
6
  *ipynb
7
  .vscode/
8
 
9
+ gpt_4_evals/
10
+ human_evals/
11
  eval-queue/
12
  eval-results/
13
+ auto_evals/
 
 
 
14
 
15
  src/assets/model_counts.html
Makefile CHANGED
@@ -1,18 +1,13 @@
1
- .PHONY: style format quality all
 
2
 
3
- # Applies code style fixes to the specified file or directory
4
  style:
5
- @echo "Applying style fixes to $(file)"
6
- ruff format $(file)
7
- ruff check --fix $(file) --line-length 119
8
 
9
- # Checks code quality for the specified file or directory
10
- quality:
11
- @echo "Checking code quality for $(file)"
12
- ruff check $(file) --line-length 119
13
 
14
- # Applies PEP8 formatting and checks the entire codebase
15
- all:
16
- @echo "Formatting and checking the entire codebase"
17
- ruff format .
18
- ruff check --fix . --line-length 119
 
1
+ .PHONY: style format
2
+
3
 
 
4
  style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
 
 
 
 
 
9
 
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
 
README.md CHANGED
@@ -1,25 +1,14 @@
1
  ---
2
- title: Open LLM Leaderboard 2
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
- duplicated_from: open-llm-leaderboard/open_llm_leaderboard
12
- fullWidth: true
13
- startup_duration_timeout: 1h
14
- hf_oauth: true
15
- space_ci:
16
- private: true
17
- secrets:
18
- - HF_TOKEN
19
- - WEBHOOK_SECRET
20
- tags:
21
- - leaderboard
22
- short_description: Track, rank and evaluate open LLMs and chatbots
23
  ---
24
 
25
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Open LLM Leaderboard
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 3.43.2
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
+ duplicated_from: HuggingFaceH4/open_llm_leaderboard
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,328 +1,582 @@
 
1
  import os
2
- import logging
3
- import time
4
- import schedule
5
- import datetime
6
  import gradio as gr
7
- from threading import Thread
8
- import datasets
9
- from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
10
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
  from apscheduler.schedulers.background import BackgroundScheduler
 
12
 
13
- # Start ephemeral Spaces on PRs (see config in README.md)
14
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
15
-
16
- from src.display.about import (
17
  CITATION_BUTTON_LABEL,
18
  CITATION_BUTTON_TEXT,
19
  EVALUATION_QUEUE_TEXT,
20
- FAQ_TEXT,
21
  INTRODUCTION_TEXT,
22
  LLM_BENCHMARKS_TEXT,
23
  TITLE,
24
  )
25
- from src.display.css_html_js import custom_css
26
- from src.display.utils import (
27
- BENCHMARK_COLS,
28
- COLS,
29
- EVAL_COLS,
30
- EVAL_TYPES,
31
  AutoEvalColumn,
32
- ModelType,
33
- Precision,
34
- WeightType,
35
  fields,
36
- EvalQueueColumn
 
 
37
  )
38
- from src.envs import (
39
- API,
40
- EVAL_REQUESTS_PATH,
41
- AGGREGATED_REPO,
42
- HF_TOKEN,
43
- QUEUE_REPO,
44
- REPO_ID,
45
- VOTES_REPO,
46
- VOTES_PATH,
47
- HF_HOME,
48
- )
49
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
50
- from src.submission.submit import add_new_eval
51
- from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
52
- from src.voting.vote_system import VoteManager, run_scheduler
53
 
54
- # Configure logging
55
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
56
 
57
- # Start ephemeral Spaces on PRs (see config in README.md)
58
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
 
 
59
 
60
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
61
- # This controls whether a full initialization should be performed.
62
- DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
63
- NEW_DATA_ON_LEADERBOARD = True
64
- LEADERBOARD_DF = None
65
 
66
  def restart_space():
67
- API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
68
-
69
-
70
- def time_diff_wrapper(func):
71
- def wrapper(*args, **kwargs):
72
- start_time = time.time()
73
- result = func(*args, **kwargs)
74
- end_time = time.time()
75
- diff = end_time - start_time
76
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
77
- return result
78
-
79
- return wrapper
80
-
81
-
82
- @time_diff_wrapper
83
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
84
- """Download dataset with exponential backoff retries."""
85
- attempt = 0
86
- while attempt < max_attempts:
87
- try:
88
- logging.info(f"Downloading {repo_id} to {local_dir}")
89
- snapshot_download(
90
- repo_id=repo_id,
91
- local_dir=local_dir,
92
- repo_type=repo_type,
93
- tqdm_class=None,
94
- etag_timeout=30,
95
- max_workers=8,
96
- )
97
- logging.info("Download successful")
98
- return
99
- except Exception as e:
100
- wait_time = backoff_factor**attempt
101
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
102
- time.sleep(wait_time)
103
- attempt += 1
104
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
105
-
106
- def get_latest_data_leaderboard(leaderboard_initial_df = None):
107
- global NEW_DATA_ON_LEADERBOARD
108
- global LEADERBOARD_DF
109
- if NEW_DATA_ON_LEADERBOARD:
110
- print("Leaderboard updated at reload!")
111
- leaderboard_dataset = datasets.load_dataset(
112
- AGGREGATED_REPO,
113
- "default",
114
- split="train",
115
- cache_dir=HF_HOME,
116
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
117
- verification_mode="no_checks"
118
- )
119
- LEADERBOARD_DF = get_leaderboard_df(
120
- leaderboard_dataset=leaderboard_dataset,
121
- cols=COLS,
122
- benchmark_cols=BENCHMARK_COLS,
 
123
  )
124
- NEW_DATA_ON_LEADERBOARD = False
125
 
126
- else:
127
- LEADERBOARD_DF = leaderboard_initial_df
128
-
129
- return LEADERBOARD_DF
130
-
131
-
132
- def get_latest_data_queue():
133
- eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
134
- return eval_queue_dfs
135
-
136
- def init_space():
137
- """Initializes the application space, loading only necessary data."""
138
- if DO_FULL_INIT:
139
- # These downloads only occur on full initialization
140
- try:
141
- download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
142
- download_dataset(VOTES_REPO, VOTES_PATH)
143
- except Exception:
144
- restart_space()
145
-
146
- # Always redownload the leaderboard DataFrame
147
- global LEADERBOARD_DF
148
- LEADERBOARD_DF = get_latest_data_leaderboard()
149
-
150
- # Evaluation queue DataFrame retrieval is independent of initialization detail level
151
- eval_queue_dfs = get_latest_data_queue()
152
-
153
- return LEADERBOARD_DF, eval_queue_dfs
154
-
155
- # Initialize VoteManager
156
- vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
157
-
158
-
159
- # Schedule the upload_votes method to run every 15 minutes
160
- schedule.every(15).minutes.do(vote_manager.upload_votes)
161
-
162
- # Start the scheduler in a separate thread
163
- scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
164
- scheduler_thread.start()
165
-
166
- # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
167
- # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
168
- LEADERBOARD_DF, eval_queue_dfs = init_space()
169
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
170
-
171
-
172
- # Data processing for plots now only on demand in the respective Gradio tab
173
- def load_and_create_plots():
174
- plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
175
- return plot_df
176
-
177
- # Function to check if a user is logged in
178
- def check_login(profile: gr.OAuthProfile | None) -> bool:
179
- if profile is None:
180
- return False
181
- return True
182
-
183
- def init_leaderboard(dataframe):
184
- if dataframe is None or dataframe.empty:
185
- raise ValueError("Leaderboard DataFrame is empty or None.")
186
- return Leaderboard(
187
- value=dataframe,
188
- datatype=[c.type for c in fields(AutoEvalColumn)],
189
- select_columns=SelectColumns(
190
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
191
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
192
- label="Select Columns to Display:",
193
- ),
194
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
195
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
196
- filter_columns=[
197
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
198
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
199
- ColumnFilter(
200
- AutoEvalColumn.params.name,
201
- type="slider",
202
- min=0.01,
203
- max=150,
204
- label="Select the number of parameters (B)",
205
- ),
206
- ColumnFilter(
207
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
208
- ),
209
- ColumnFilter(
210
- AutoEvalColumn.merged.name, type="boolean", label="Merge/MoErge", default=True
211
- ),
212
- ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
213
- ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
214
- ColumnFilter(AutoEvalColumn.maintainers_highlight.name, type="boolean", label="Show only maintainer's highlight", default=False),
215
- ],
216
- bool_checkboxgroup_label="Hide models",
217
- interactive=False,
218
  )
219
 
220
- main_block = gr.Blocks(css=custom_css)
221
- with main_block:
222
- with gr.Row(elem_id="header-row"):
223
- gr.HTML(TITLE)
224
-
225
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
226
 
227
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
228
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
229
- leaderboard = init_leaderboard(LEADERBOARD_DF)
230
 
231
- with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
232
  with gr.Column():
233
  with gr.Row():
234
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
235
 
236
  with gr.Row():
237
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
238
 
239
  with gr.Row():
240
  with gr.Column():
241
  model_name_textbox = gr.Textbox(label="Model name")
242
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="latest")
243
- with gr.Row():
244
- model_type = gr.Dropdown(
245
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
246
- label="Model type",
247
- multiselect=False,
248
- value=ModelType.FT.to_str(" : "),
249
- interactive=True,
250
- )
251
- chat_template_toggle = gr.Checkbox(
252
- label="Use chat template",
253
- value=False,
254
- info="Is your model a chat model?",
255
- )
256
 
257
  with gr.Column():
258
  precision = gr.Dropdown(
259
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
260
  label="Precision",
261
  multiselect=False,
262
  value="float16",
263
  interactive=True,
264
  )
265
  weight_type = gr.Dropdown(
266
- choices=[i.value.name for i in WeightType],
267
  label="Weights type",
268
  multiselect=False,
269
  value="Original",
270
  interactive=True,
271
  )
272
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
273
-
274
- with gr.Column():
275
- with gr.Accordion(
276
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
277
- open=False,
278
- ):
279
- with gr.Row():
280
- finished_eval_table = gr.components.Dataframe(
281
- value=finished_eval_queue_df,
282
- headers=EVAL_COLS,
283
- datatype=EVAL_TYPES,
284
- row_count=5,
285
- interactive=False,
286
- )
287
- with gr.Accordion(
288
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
289
- open=False,
290
- ):
291
- with gr.Row():
292
- running_eval_table = gr.components.Dataframe(
293
- value=running_eval_queue_df,
294
- headers=EVAL_COLS,
295
- datatype=EVAL_TYPES,
296
- row_count=5,
297
- interactive=False,
298
- )
299
-
300
- with gr.Accordion(
301
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
302
- open=False,
303
- ):
304
- with gr.Row():
305
- pending_eval_table = gr.components.Dataframe(
306
- value=pending_eval_queue_df,
307
- headers=EVAL_COLS,
308
- datatype=EVAL_TYPES,
309
- row_count=5,
310
- interactive=False,
311
- )
312
 
313
  submit_button = gr.Button("Submit Eval")
314
  submission_result = gr.Markdown()
315
-
316
- # The chat template checkbox update function
317
- def update_chat_checkbox(model_type_value):
318
- return ModelType.from_str(model_type_value) == ModelType.chat
319
-
320
- model_type.change(
321
- fn=update_chat_checkbox,
322
- inputs=[model_type], # Pass the current checkbox value
323
- outputs=chat_template_toggle,
324
- )
325
-
326
  submit_button.click(
327
  add_new_eval,
328
  [
@@ -330,61 +584,13 @@ with main_block:
330
  base_model_name_textbox,
331
  revision_name_textbox,
332
  precision,
 
333
  weight_type,
334
  model_type,
335
- chat_template_toggle,
336
  ],
337
  submission_result,
338
  )
339
 
340
- # Ensure the values in 'pending_eval_queue_df' are correct and ready for the DataFrame component
341
- with gr.TabItem("🆙 Model Vote"):
342
- with gr.Row():
343
- gr.Markdown(
344
- "## Vote for the models which should be evaluated first! \nYou'll need to sign in with the button above first. All votes are recorded.",
345
- elem_classes="markdown-text"
346
- )
347
- login_button = gr.LoginButton(elem_id="oauth-button")
348
-
349
-
350
- with gr.Row():
351
- pending_models = pending_eval_queue_df[EvalQueueColumn.model_name.name].to_list()
352
-
353
- with gr.Column():
354
- selected_model = gr.Dropdown(
355
- choices=pending_models,
356
- label="Models",
357
- multiselect=False,
358
- value="str",
359
- interactive=True,
360
- )
361
-
362
- vote_button = gr.Button("Vote", variant="primary")
363
-
364
- with gr.Row():
365
- with gr.Accordion(
366
- f"Available models pending ({len(pending_eval_queue_df)})",
367
- open=True,
368
- ):
369
- with gr.Row():
370
- pending_eval_table_votes = gr.components.Dataframe(
371
- value=vote_manager.create_request_vote_df(
372
- pending_eval_queue_df
373
- ),
374
- headers=EVAL_COLS,
375
- datatype=EVAL_TYPES,
376
- row_count=5,
377
- interactive=False
378
- )
379
-
380
- # Set the click event for the vote button
381
- vote_button.click(
382
- vote_manager.add_vote,
383
- inputs=[selected_model, pending_eval_table],
384
- outputs=[pending_eval_table_votes]
385
- )
386
-
387
-
388
  with gr.Row():
389
  with gr.Accordion("📙 Citation", open=False):
390
  citation_button = gr.Textbox(
@@ -392,81 +598,17 @@ with main_block:
392
  label=CITATION_BUTTON_LABEL,
393
  lines=20,
394
  elem_id="citation-button",
395
- show_copy_button=True,
396
- )
397
-
398
- main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
399
- leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
400
- pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
401
-
402
- main_block.queue(default_concurrency_limit=40)
403
-
404
-
405
- def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
406
- # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
407
- # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
408
- # ht to Lucain!
409
- if SPACE_ID is None:
410
- print("Not in a Space: Space CI disabled.")
411
- return WebhooksServer(ui=main_block)
412
-
413
- if IS_EPHEMERAL_SPACE:
414
- print("In an ephemeral Space: Space CI disabled.")
415
- return WebhooksServer(ui=main_block)
416
-
417
- card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
418
- config = card.data.get("space_ci", {})
419
- print(f"Enabling Space CI with config from README: {config}")
420
-
421
- return configure_space_ci(
422
- blocks=ui,
423
- trusted_authors=config.get("trusted_authors"),
424
- private=config.get("private", "auto"),
425
- variables=config.get("variables", "auto"),
426
- secrets=config.get("secrets"),
427
- hardware=config.get("hardware"),
428
- storage=config.get("storage"),
429
  )
430
 
431
- # Create webhooks server (with CI url if in Space and not ephemeral)
432
- webhooks_server = enable_space_ci_and_return_server(ui=main_block)
433
-
434
- # Add webhooks
435
- @webhooks_server.add_webhook
436
- def update_leaderboard(payload: WebhookPayload) -> None:
437
- """Redownloads the leaderboard dataset each time it updates"""
438
- if payload.repo.type == "dataset" and payload.event.action == "update":
439
- global NEW_DATA_ON_LEADERBOARD
440
- if NEW_DATA_ON_LEADERBOARD:
441
- return
442
- NEW_DATA_ON_LEADERBOARD = True
443
-
444
- datasets.load_dataset(
445
- AGGREGATED_REPO,
446
- "default",
447
- split="train",
448
- cache_dir=HF_HOME,
449
- download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
450
- verification_mode="no_checks"
451
- )
452
-
453
- # The below code is not used at the moment, as we can manage the queue file locally
454
- LAST_UPDATE_QUEUE = datetime.datetime.now()
455
- @webhooks_server.add_webhook
456
- def update_queue(payload: WebhookPayload) -> None:
457
- """Redownloads the queue dataset each time it updates"""
458
- if payload.repo.type == "dataset" and payload.event.action == "update":
459
- current_time = datetime.datetime.now()
460
- global LAST_UPDATE_QUEUE
461
- if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
462
- print("Would have updated the queue")
463
- # We only redownload is last update was more than 10 minutes ago, as the queue is
464
- # updated regularly and heavy to download
465
- download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
466
- LAST_UPDATE_QUEUE = datetime.datetime.now()
467
-
468
- webhooks_server.launch()
469
-
470
  scheduler = BackgroundScheduler()
471
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
472
- scheduler.start()
 
 
1
+ import json
2
  import os
3
+ from datetime import datetime, timezone
4
+
 
 
5
  import gradio as gr
6
+ import pandas as pd
 
 
 
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
+ from huggingface_hub import HfApi
9
 
10
+ from src.assets.css_html_js import custom_css, get_window_url_params
11
+ from src.assets.text_content import (
 
 
12
  CITATION_BUTTON_LABEL,
13
  CITATION_BUTTON_TEXT,
14
  EVALUATION_QUEUE_TEXT,
 
15
  INTRODUCTION_TEXT,
16
  LLM_BENCHMARKS_TEXT,
17
  TITLE,
18
  )
19
+ from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
20
+ from src.display_models.modelcard_filter import check_model_card
21
+ from src.display_models.utils import (
 
 
 
22
  AutoEvalColumn,
23
+ EvalQueueColumn,
 
 
24
  fields,
25
+ styled_error,
26
+ styled_message,
27
+ styled_warning,
28
  )
29
+ from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
30
+ from src.rate_limiting import user_submission_permission
31
+
32
+ pd.set_option("display.precision", 1)
33
+
34
+ # clone / pull the lmeh eval data
35
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
36
+
37
+ QUEUE_REPO = "open-llm-leaderboard/requests"
38
+ RESULTS_REPO = "open-llm-leaderboard/results"
39
+
40
+ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
41
+ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
42
+
43
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
44
 
45
+ EVAL_REQUESTS_PATH = "eval-queue"
46
+ EVAL_RESULTS_PATH = "eval-results"
47
 
48
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
49
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
50
+
51
+ api = HfApi(token=H4_TOKEN)
52
 
53
 
54
  def restart_space():
55
+ api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)
56
+
57
+
58
+ # Rate limit variables
59
+ RATE_LIMIT_PERIOD = 7
60
+ RATE_LIMIT_QUOTA = 5
61
+
62
+ # Column selection
63
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
64
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
65
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
66
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
67
+
68
+ if not IS_PUBLIC:
69
+ COLS.insert(2, AutoEvalColumn.precision.name)
70
+ TYPES.insert(2, AutoEvalColumn.precision.type)
71
+
72
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
73
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
74
+
75
+ BENCHMARK_COLS = [
76
+ c.name
77
+ for c in [
78
+ AutoEvalColumn.arc,
79
+ AutoEvalColumn.hellaswag,
80
+ AutoEvalColumn.mmlu,
81
+ AutoEvalColumn.truthfulqa,
82
+ ]
83
+ ]
84
+
85
+ ## LOAD INFO FROM HUB
86
+ eval_queue, requested_models, eval_results, users_to_submission_dates = load_all_info_from_hub(
87
+ QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
88
+ )
89
+
90
+ if not IS_PUBLIC:
91
+ (eval_queue_private, requested_models_private, eval_results_private, _) = load_all_info_from_hub(
92
+ PRIVATE_QUEUE_REPO,
93
+ PRIVATE_RESULTS_REPO,
94
+ EVAL_REQUESTS_PATH_PRIVATE,
95
+ EVAL_RESULTS_PATH_PRIVATE,
96
+ )
97
+ else:
98
+ eval_queue_private, eval_results_private = None, None
99
+
100
+ original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
101
+ models = original_df["model_name_for_query"].tolist() # needed for model backlinks from their model cards to the leaderboard
102
+
103
+ to_be_dumped = f"models = {repr(models)}\n"
104
+
105
+ leaderboard_df = original_df.copy()
106
+ (
107
+ finished_eval_queue_df,
108
+ running_eval_queue_df,
109
+ pending_eval_queue_df,
110
+ ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
111
+
112
+
113
+ ## INTERACTION FUNCTIONS
114
+ def add_new_eval(
115
+ model: str,
116
+ base_model: str,
117
+ revision: str,
118
+ precision: str,
119
+ private: bool,
120
+ weight_type: str,
121
+ model_type: str,
122
+ ):
123
+ precision = precision.split(" ")[0]
124
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
125
+
126
+ if model_type is None or model_type == "":
127
+ return styled_error("Please select a model type.")
128
+
129
+ # Is the user rate limited?
130
+ num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
131
+ if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
132
+ error_msg = f"Organisation or user `{model.split('/')[0]}`"
133
+ error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
134
+ error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
135
+ error_msg += (
136
+ "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
137
  )
138
+ return styled_error(error_msg)
139
+
140
+ # Did the model authors forbid its submission to the leaderboard?
141
+ if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
142
+ return styled_warning("Model authors have requested that their model not be submitted to the leaderboard.")
143
+
144
+ # Does the model actually exist?
145
+ if revision == "":
146
+ revision = "main"
147
+
148
+ if weight_type in ["Delta", "Adapter"]:
149
+ base_model_on_hub, error = is_model_on_hub(base_model, revision)
150
+ if not base_model_on_hub:
151
+ return styled_error(f'Base model "{base_model}" {error}')
152
+
153
+ if not weight_type == "Adapter":
154
+ model_on_hub, error = is_model_on_hub(model, revision)
155
+ if not model_on_hub:
156
+ return styled_error(f'Model "{model}" {error}')
157
+
158
+ # Were the model card and license filled?
159
+ modelcard_OK, error_msg = check_model_card(model)
160
+ if not modelcard_OK:
161
+ return styled_error(error_msg)
162
+
163
+ # Seems good, creating the eval
164
+ print("Adding new eval")
165
+
166
+ eval_entry = {
167
+ "model": model,
168
+ "base_model": base_model,
169
+ "revision": revision,
170
+ "private": private,
171
+ "precision": precision,
172
+ "weight_type": weight_type,
173
+ "status": "PENDING",
174
+ "submitted_time": current_time,
175
+ "model_type": model_type,
176
+ }
177
+
178
+ user_name = ""
179
+ model_path = model
180
+ if "/" in model:
181
+ user_name = model.split("/")[0]
182
+ model_path = model.split("/")[1]
183
+
184
+ print("Creating eval file")
185
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
186
+ os.makedirs(OUT_DIR, exist_ok=True)
187
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
188
+
189
+ # Check for duplicate submission
190
+ if f"{model}_{revision}_{precision}" in requested_models:
191
+ return styled_warning("This model has already been submitted.")
192
+
193
+ with open(out_path, "w") as f:
194
+ f.write(json.dumps(eval_entry))
195
+
196
+ print("Uploading eval file")
197
+ api.upload_file(
198
+ path_or_fileobj=out_path,
199
+ path_in_repo=out_path.split("eval-queue/")[1],
200
+ repo_id=QUEUE_REPO,
201
+ repo_type="dataset",
202
+ commit_message=f"Add {model} to eval queue",
203
+ )
204
 
205
+ # Remove the local file
206
+ os.remove(out_path)
207
+
208
+ return styled_message(
209
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
210
  )
211
 
212
+
213
+ # Basics
214
+ def change_tab(query_param: str):
215
+ query_param = query_param.replace("'", '"')
216
+ query_param = json.loads(query_param)
217
+
218
+ if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
219
+ return gr.Tabs.update(selected=1)
220
+ else:
221
+ return gr.Tabs.update(selected=0)
222
+
223
+
224
+ # Searching and filtering
225
+ def update_table(
226
+ hidden_df: pd.DataFrame,
227
+ current_columns_df: pd.DataFrame,
228
+ columns: list,
229
+ type_query: list,
230
+ precision_query: str,
231
+ size_query: list,
232
+ show_deleted: bool,
233
+ query: str,
234
+ ):
235
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
236
+ final_df = []
237
+ if query != "":
238
+ queries = query.split(";")
239
+ for _q in queries:
240
+ if _q != "":
241
+ temp_filtered_df = search_table(filtered_df, _q)
242
+ if len(temp_filtered_df) > 0:
243
+ final_df.append(temp_filtered_df)
244
+ if len(final_df) > 0:
245
+ filtered_df = pd.concat(final_df).drop_duplicates()
246
+ df = select_columns(filtered_df, columns)
247
+ return df
248
+
249
+
250
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
251
+ return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
252
+
253
+
254
+ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
255
+ always_here_cols = [
256
+ AutoEvalColumn.model_type_symbol.name,
257
+ AutoEvalColumn.model.name,
258
+ ]
259
+ # We use COLS to maintain sorting
260
+ filtered_df = df[
261
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
262
+ ]
263
+ return filtered_df
264
+
265
+
266
+ NUMERIC_INTERVALS = {
267
+ "Unknown": pd.Interval(-1, 0, closed="right"),
268
+ "< 1.5B": pd.Interval(0, 1.5, closed="right"),
269
+ "~3B": pd.Interval(1.5, 5, closed="right"),
270
+ "~7B": pd.Interval(6, 11, closed="right"),
271
+ "~13B": pd.Interval(12, 15, closed="right"),
272
+ "~35B": pd.Interval(16, 55, closed="right"),
273
+ "60B+": pd.Interval(55, 10000, closed="right"),
274
+ }
275
+
276
+
277
+ def filter_models(
278
+ df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
279
+ ) -> pd.DataFrame:
280
+ # Show all models
281
+ if show_deleted:
282
+ filtered_df = df
283
+ else: # Show only still on the hub models
284
+ filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
285
+
286
+ type_emoji = [t[0] for t in type_query]
287
+ filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
288
+ filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
289
+
290
+ numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
291
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
292
+ mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
293
+ filtered_df = filtered_df.loc[mask]
294
+
295
+ return filtered_df
296
+
297
+
298
+ demo = gr.Blocks(css=custom_css)
299
+ with demo:
300
+ gr.HTML(TITLE)
301
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
302
 
303
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
304
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
305
+ with gr.Row():
306
+ with gr.Column():
307
+ with gr.Row():
308
+ search_bar = gr.Textbox(
309
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
310
+ show_label=False,
311
+ elem_id="search-bar",
312
+ )
313
+ with gr.Row():
314
+ shown_columns = gr.CheckboxGroup(
315
+ choices=[
316
+ c
317
+ for c in COLS
318
+ if c
319
+ not in [
320
+ AutoEvalColumn.dummy.name,
321
+ AutoEvalColumn.model.name,
322
+ AutoEvalColumn.model_type_symbol.name,
323
+ AutoEvalColumn.still_on_hub.name,
324
+ ]
325
+ ],
326
+ value=[
327
+ c
328
+ for c in COLS_LITE
329
+ if c
330
+ not in [
331
+ AutoEvalColumn.dummy.name,
332
+ AutoEvalColumn.model.name,
333
+ AutoEvalColumn.model_type_symbol.name,
334
+ AutoEvalColumn.still_on_hub.name,
335
+ ]
336
+ ],
337
+ label="Select columns to show",
338
+ elem_id="column-select",
339
+ interactive=True,
340
+ )
341
+ with gr.Row():
342
+ deleted_models_visibility = gr.Checkbox(
343
+ value=True, label="Show gated/private/deleted models", interactive=True
344
+ )
345
+ with gr.Column(min_width=320):
346
+ with gr.Box(elem_id="box-filter"):
347
+ filter_columns_type = gr.CheckboxGroup(
348
+ label="Model types",
349
+ choices=[
350
+ ModelType.PT.to_str(),
351
+ ModelType.FT.to_str(),
352
+ ModelType.IFT.to_str(),
353
+ ModelType.RL.to_str(),
354
+ ModelType.Unknown.to_str(),
355
+ ],
356
+ value=[
357
+ ModelType.PT.to_str(),
358
+ ModelType.FT.to_str(),
359
+ ModelType.IFT.to_str(),
360
+ ModelType.RL.to_str(),
361
+ ModelType.Unknown.to_str(),
362
+ ],
363
+ interactive=True,
364
+ elem_id="filter-columns-type",
365
+ )
366
+ filter_columns_precision = gr.CheckboxGroup(
367
+ label="Precision",
368
+ choices=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
369
+ value=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
370
+ interactive=True,
371
+ elem_id="filter-columns-precision",
372
+ )
373
+ filter_columns_size = gr.CheckboxGroup(
374
+ label="Model sizes",
375
+ choices=list(NUMERIC_INTERVALS.keys()),
376
+ value=list(NUMERIC_INTERVALS.keys()),
377
+ interactive=True,
378
+ elem_id="filter-columns-size",
379
+ )
380
 
381
+ leaderboard_table = gr.components.Dataframe(
382
+ value=leaderboard_df[
383
+ [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
384
+ + shown_columns.value
385
+ + [AutoEvalColumn.dummy.name]
386
+ ],
387
+ headers=[
388
+ AutoEvalColumn.model_type_symbol.name,
389
+ AutoEvalColumn.model.name,
390
+ ]
391
+ + shown_columns.value
392
+ + [AutoEvalColumn.dummy.name],
393
+ datatype=TYPES,
394
+ max_rows=None,
395
+ elem_id="leaderboard-table",
396
+ interactive=False,
397
+ visible=True,
398
+ )
399
+
400
+ # Dummy leaderboard for handling the case when the user uses backspace key
401
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
402
+ value=original_df,
403
+ headers=COLS,
404
+ datatype=TYPES,
405
+ max_rows=None,
406
+ visible=False,
407
+ )
408
+ search_bar.submit(
409
+ update_table,
410
+ [
411
+ hidden_leaderboard_table_for_search,
412
+ leaderboard_table,
413
+ shown_columns,
414
+ filter_columns_type,
415
+ filter_columns_precision,
416
+ filter_columns_size,
417
+ deleted_models_visibility,
418
+ search_bar,
419
+ ],
420
+ leaderboard_table,
421
+ )
422
+ shown_columns.change(
423
+ update_table,
424
+ [
425
+ hidden_leaderboard_table_for_search,
426
+ leaderboard_table,
427
+ shown_columns,
428
+ filter_columns_type,
429
+ filter_columns_precision,
430
+ filter_columns_size,
431
+ deleted_models_visibility,
432
+ search_bar,
433
+ ],
434
+ leaderboard_table,
435
+ queue=True,
436
+ )
437
+ filter_columns_type.change(
438
+ update_table,
439
+ [
440
+ hidden_leaderboard_table_for_search,
441
+ leaderboard_table,
442
+ shown_columns,
443
+ filter_columns_type,
444
+ filter_columns_precision,
445
+ filter_columns_size,
446
+ deleted_models_visibility,
447
+ search_bar,
448
+ ],
449
+ leaderboard_table,
450
+ queue=True,
451
+ )
452
+ filter_columns_precision.change(
453
+ update_table,
454
+ [
455
+ hidden_leaderboard_table_for_search,
456
+ leaderboard_table,
457
+ shown_columns,
458
+ filter_columns_type,
459
+ filter_columns_precision,
460
+ filter_columns_size,
461
+ deleted_models_visibility,
462
+ search_bar,
463
+ ],
464
+ leaderboard_table,
465
+ queue=True,
466
+ )
467
+ filter_columns_size.change(
468
+ update_table,
469
+ [
470
+ hidden_leaderboard_table_for_search,
471
+ leaderboard_table,
472
+ shown_columns,
473
+ filter_columns_type,
474
+ filter_columns_precision,
475
+ filter_columns_size,
476
+ deleted_models_visibility,
477
+ search_bar,
478
+ ],
479
+ leaderboard_table,
480
+ queue=True,
481
+ )
482
+ deleted_models_visibility.change(
483
+ update_table,
484
+ [
485
+ hidden_leaderboard_table_for_search,
486
+ leaderboard_table,
487
+ shown_columns,
488
+ filter_columns_type,
489
+ filter_columns_precision,
490
+ filter_columns_size,
491
+ deleted_models_visibility,
492
+ search_bar,
493
+ ],
494
+ leaderboard_table,
495
+ queue=True,
496
+ )
497
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
498
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
499
+
500
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
501
  with gr.Column():
502
  with gr.Row():
503
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
504
 
505
+ with gr.Column():
506
+ with gr.Accordion(
507
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
508
+ open=False,
509
+ ):
510
+ with gr.Row():
511
+ finished_eval_table = gr.components.Dataframe(
512
+ value=finished_eval_queue_df,
513
+ headers=EVAL_COLS,
514
+ datatype=EVAL_TYPES,
515
+ max_rows=5,
516
+ )
517
+ with gr.Accordion(
518
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
519
+ open=False,
520
+ ):
521
+ with gr.Row():
522
+ running_eval_table = gr.components.Dataframe(
523
+ value=running_eval_queue_df,
524
+ headers=EVAL_COLS,
525
+ datatype=EVAL_TYPES,
526
+ max_rows=5,
527
+ )
528
+
529
+ with gr.Accordion(
530
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
531
+ open=False,
532
+ ):
533
+ with gr.Row():
534
+ pending_eval_table = gr.components.Dataframe(
535
+ value=pending_eval_queue_df,
536
+ headers=EVAL_COLS,
537
+ datatype=EVAL_TYPES,
538
+ max_rows=5,
539
+ )
540
  with gr.Row():
541
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
542
 
543
  with gr.Row():
544
  with gr.Column():
545
  model_name_textbox = gr.Textbox(label="Model name")
546
+ revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
547
+ private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
548
+ model_type = gr.Dropdown(
549
+ choices=[
550
+ ModelType.PT.to_str(" : "),
551
+ ModelType.FT.to_str(" : "),
552
+ ModelType.IFT.to_str(" : "),
553
+ ModelType.RL.to_str(" : "),
554
+ ],
555
+ label="Model type",
556
+ multiselect=False,
557
+ value=None,
558
+ interactive=True,
559
+ )
560
 
561
  with gr.Column():
562
  precision = gr.Dropdown(
563
+ choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ"],
564
  label="Precision",
565
  multiselect=False,
566
  value="float16",
567
  interactive=True,
568
  )
569
  weight_type = gr.Dropdown(
570
+ choices=["Original", "Delta", "Adapter"],
571
  label="Weights type",
572
  multiselect=False,
573
  value="Original",
574
  interactive=True,
575
  )
576
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
577
 
578
  submit_button = gr.Button("Submit Eval")
579
  submission_result = gr.Markdown()
580
  submit_button.click(
581
  add_new_eval,
582
  [
 
584
  base_model_name_textbox,
585
  revision_name_textbox,
586
  precision,
587
+ private,
588
  weight_type,
589
  model_type,
 
590
  ],
591
  submission_result,
592
  )
593
 
594
  with gr.Row():
595
  with gr.Accordion("📙 Citation", open=False):
596
  citation_button = gr.Textbox(
 
598
  label=CITATION_BUTTON_LABEL,
599
  lines=20,
600
  elem_id="citation-button",
601
+ ).style(show_copy_button=True)
602
+
603
+ dummy = gr.Textbox(visible=False)
604
+ demo.load(
605
+ change_tab,
606
+ dummy,
607
+ tabs,
608
+ _js=get_window_url_params,
609
  )
610
 
611
  scheduler = BackgroundScheduler()
612
+ scheduler.add_job(restart_space, "interval", seconds=1800)
613
+ scheduler.start()
614
+ demo.queue(concurrency_count=40).launch()
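The new `app.py` gates submissions behind a simple rate limit: `user_submission_permission` (imported from `src.rate_limiting`, whose implementation is not part of this diff) counts how many eval requests an organisation has filed during the last `RATE_LIMIT_PERIOD` days, and `add_new_eval` rejects the submission once that count exceeds `RATE_LIMIT_QUOTA`. Below is a minimal sketch of that counting step, assuming submission dates are stored as ISO-8601 strings per user, as the `submitted_time` field suggests; the helper name and data shape are illustrative assumptions, not the repository's code.

```python
from datetime import datetime, timedelta, timezone

RATE_LIMIT_PERIOD = 7  # days, mirroring the constant defined in app.py
RATE_LIMIT_QUOTA = 5   # maximum submissions per organisation within that period


def count_recent_submissions(user: str, users_to_submission_dates: dict[str, list[str]]) -> int:
    """Count how many eval requests `user` filed within the last RATE_LIMIT_PERIOD days."""
    cutoff = datetime.now(timezone.utc) - timedelta(days=RATE_LIMIT_PERIOD)
    dates = users_to_submission_dates.get(user, [])
    # submitted_time is written as "%Y-%m-%dT%H:%M:%SZ"; normalise the trailing "Z" for fromisoformat
    return sum(datetime.fromisoformat(d.replace("Z", "+00:00")) >= cutoff for d in dates)


# Example: an organisation with six requests filed this week would be over the quota.
recent = count_recent_submissions(
    "some-org", {"some-org": [datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")] * 6}
)
if recent > RATE_LIMIT_QUOTA:
    print(f"Rejected: {recent} submissions in the last {RATE_LIMIT_PERIOD} days")
```

In the actual flow, exceeding the quota returns the styled error string built in `add_new_eval` above.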
model_info_cache.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fcaa2a3e1ac6a5559471547af5de4e3ccd49673ad5525890726e65cd90cfe62
3
+ size 3620752
model_size_cache.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75d1f64589459eb64e3a50987bf05ed3656248102d1fe2f6c98a008020945840
3
+ size 74321
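Both new `.pkl` caches are committed through Git LFS, so the three lines shown for each file (`version`, `oid`, `size`) are the entire checked-in content; the binary pickles live in LFS storage. For illustration only (this helper is not part of the repository), such a pointer file can be parsed like this:

```python
from pathlib import Path


def parse_lfs_pointer(path: str) -> dict[str, str]:
    """Split a Git LFS pointer file into its key/value fields (version, oid, size)."""
    fields: dict[str, str] = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


# e.g. parse_lfs_pointer("model_size_cache.pkl") would return something like
# {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:75d1f6...", "size": "74321"}
```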
src/tools/model_backlinks.py → models_backlinks.py RENAMED
@@ -630,7 +630,7 @@ models = [
630
  "WizardLM/WizardMath-7B-V1.0",
631
  "Norquinal/llama-2-7b-claude-chat",
632
  "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
633
- "open-llm-leaderboard/starchat-beta",
634
  "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
635
  "conceptofmind/LLongMA-2-13b-16k",
636
  "tianyil1/denas-llama2",
@@ -1039,7 +1039,7 @@ models = [
1039
  "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
1040
  "EleutherAI/gpt-neo-2.7B",
1041
  "danielhanchen/open_llama_3b_600bt_preview",
1042
- "open-llm-leaderboard/starchat-alpha",
1043
  "pythainlp/wangchanglm-7.5B-sft-en-sharded",
1044
  "beaugogh/pythia-1.4b-deduped-sharegpt",
1045
  "HWERI/pythia-1.4b-deduped-sharegpt",
 
630
  "WizardLM/WizardMath-7B-V1.0",
631
  "Norquinal/llama-2-7b-claude-chat",
632
  "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
633
+ "HuggingFaceH4/starchat-beta",
634
  "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
635
  "conceptofmind/LLongMA-2-13b-16k",
636
  "tianyil1/denas-llama2",
 
1039
  "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
1040
  "EleutherAI/gpt-neo-2.7B",
1041
  "danielhanchen/open_llama_3b_600bt_preview",
1042
+ "HuggingFaceH4/starchat-alpha",
1043
  "pythainlp/wangchanglm-7.5B-sft-en-sharded",
1044
  "beaugogh/pythia-1.4b-deduped-sharegpt",
1045
  "HWERI/pythia-1.4b-deduped-sharegpt",
pyproject.toml CHANGED
@@ -1,15 +1,9 @@
1
  [tool.ruff]
2
- line-length = 120
3
- target-version = "py312"
4
- include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
5
- ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004","D107","FA102"]
6
- fixable=["ALL"]
7
- select=["ALL"]
8
-
9
- [tool.ruff.lint]
10
  select = ["E", "F"]
11
- fixable = ["ALL"]
12
  ignore = ["E501"] # line too long (black is taking care of this)
 
 
13
 
14
  [tool.isort]
15
  profile = "black"
@@ -17,40 +11,3 @@ line_length = 119
17
 
18
  [tool.black]
19
  line-length = 119
20
-
21
- [tool.poetry]
22
- package-mode = false
23
- name = "open-llm-leaderboard"
24
- version = "0.1.0"
25
- description = ""
26
- authors = []
27
- readme = "README.md"
28
-
29
- [tool.poetry.dependencies]
30
- python = "3.12.1"
31
- apscheduler = "3.10.1"
32
- black = "23.11.0"
33
- click = "8.1.3"
34
- datasets = "2.14.5"
35
- huggingface-hub = ">=0.18.0"
36
- matplotlib = "3.8.4"
37
- numpy = "1.26.0"
38
- pandas = "2.2.2"
39
- plotly = "5.14.1"
40
- python-dateutil = "2.8.2"
41
- sentencepiece = "^0.2.0"
42
- tqdm = "4.65.0"
43
- transformers = "4.41.1"
44
- tokenizers = ">=0.15.0"
45
- gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
46
- isort = "^5.13.2"
47
- ruff = "^0.3.5"
48
- gradio-leaderboard = "0.0.8"
49
- gradio = {extras = ["oauth"], version = "^4.36.1"}
50
- requests = "^2.31.0"
51
- requests-oauthlib = "^1.3.1"
52
- schedule = "^1.2.2"
53
-
54
- [build-system]
55
- requires = ["poetry-core"]
56
- build-backend = "poetry.core.masonry.api"
 
1
  [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
  select = ["E", "F"]
 
4
  ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
 
8
  [tool.isort]
9
  profile = "black"
 
11
 
12
  [tool.black]
13
  line-length = 119
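The trimmed configuration limits ruff to the pycodestyle (`E`) and Pyflakes (`F`) rule sets and ignores `E501`, since black (with `line-length = 119`) is responsible for line length. A quick, made-up example of what that selection catches and skips (not project code):

```python
import json  # flagged by ruff as F401 ("imported but unused"), a Pyflakes rule enabled via the "F" selector

GREETING = "this assignment line is intentionally padded well past one hundred and nineteen characters so that pycodestyle would normally report it"  # not reported: E501 is ignored
```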
requirements.txt CHANGED
@@ -1,23 +1,71 @@
1
  APScheduler==3.10.1
2
- black==23.11.0
 
 
 
3
  click==8.1.3
4
- datasets==2.14.5
5
- huggingface-hub>=0.18.0
6
- matplotlib==3.8.4
7
- numpy==1.26.0
8
- pandas==2.2.2
9
  plotly==5.14.1
10
  python-dateutil==2.8.2
11
- sentencepiece
12
  tqdm==4.65.0
13
- transformers==4.41.1
14
- tokenizers>=0.15.0
15
- gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
16
- isort
17
- ruff
18
- gradio==4.31.0
19
- gradio[oauth]
20
- gradio_leaderboard==0.0.9
21
- requests==2.31.0
22
- requests-oauthlib== 1.3.1
23
- schedule == 1.2.2
 
1
+ accelerate==0.23.0
2
+ aiofiles==23.1.0
3
+ aiohttp==3.8.4
4
+ aiosignal==1.3.1
5
+ altair==4.2.2
6
+ anyio==3.6.2
7
  APScheduler==3.10.1
8
+ async-timeout==4.0.2
9
+ attrs==23.1.0
10
+ certifi==2022.12.7
11
+ charset-normalizer==3.1.0
12
  click==8.1.3
13
+ contourpy==1.0.7
14
+ cycler==0.11.0
15
+ datasets==2.12.0
16
+ entrypoints==0.4
17
+ fastapi==0.95.1
18
+ ffmpy==0.3.0
19
+ filelock==3.11.0
20
+ fonttools==4.39.3
21
+ frozenlist==1.3.3
22
+ fsspec==2023.4.0
23
+ gradio==3.43.2
24
+ gradio-client==0.5.0
25
+ h11==0.14.0
26
+ httpcore==0.17.0
27
+ httpx==0.24.0
28
+ huggingface-hub==0.16.4
29
+ idna==3.4
30
+ Jinja2==3.1.2
31
+ jsonschema==4.17.3
32
+ kiwisolver==1.4.4
33
+ linkify-it-py==2.0.0
34
+ markdown-it-py==2.2.0
35
+ MarkupSafe==2.1.2
36
+ matplotlib==3.7.1
37
+ mdit-py-plugins==0.3.3
38
+ mdurl==0.1.2
39
+ multidict==6.0.4
40
+ numpy==1.24.2
41
+ orjson==3.8.10
42
+ packaging==23.1
43
+ pandas==2.0.0
44
+ Pillow==9.5.0
45
  plotly==5.14.1
46
+ pyarrow==11.0.0
47
+ pydantic==1.10.7
48
+ pydub==0.25.1
49
+ pyparsing==3.0.9
50
+ pyrsistent==0.19.3
51
  python-dateutil==2.8.2
52
+ python-multipart==0.0.6
53
+ pytz==2023.3
54
+ pytz-deprecation-shim==0.1.0.post0
55
+ PyYAML==6.0
56
+ requests==2.28.2
57
+ semantic-version==2.10.0
58
+ six==1.16.0
59
+ sniffio==1.3.0
60
+ starlette==0.26.1
61
+ toolz==0.12.0
62
  tqdm==4.65.0
63
+ transformers@git+https://github.com/huggingface/transformers
64
+ typing_extensions==4.5.0
65
+ tzdata==2023.3
66
+ tzlocal==4.3
67
+ uc-micro-py==1.0.1
68
+ urllib3==1.26.15
69
+ uvicorn==0.21.1
70
+ websockets==11.0.1
71
+ yarl==1.8.2
 
 
src/{display → assets}/css_html_js.py RENAMED
@@ -1,18 +1,5 @@
1
  custom_css = """
2
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
3
- table td:first-child,
4
- table th:first-child {
5
- max-width: 400px;
6
- overflow: auto;
7
- white-space: nowrap;
8
- }
9
 
10
- /* Full width space */
11
- .gradio-container {
12
- max-width: 95% !important;
13
- }
14
-
15
- /* Text style and margins */
16
  .markdown-text {
17
  font-size: 16px !important;
18
  }
@@ -34,21 +21,54 @@ table th:first-child {
34
  transform: scale(1.3);
35
  }
36
 
37
  #search-bar-table-box > div:first-child {
38
  background: none;
39
  border: none;
40
  }
41
-
42
  #search-bar {
43
  padding: 0px;
44
  }
45
 
46
  .tab-buttons button {
47
  font-size: 20px;
48
  }
49
 
50
- /* Filters style */
51
- #filter_type {
  border: 0;
53
  padding-left: 0;
54
  padding-top: 0;
@@ -56,53 +76,29 @@ table th:first-child {
56
  #filter_type label {
57
  display: flex;
58
  }
59
- #filter_type label > span {
60
  margin-top: var(--spacing-lg);
61
  margin-right: 0.5em;
62
  }
63
- #filter_type label > .wrap {
64
  width: 103px;
65
  }
66
- #filter_type label > .wrap .wrap-inner {
67
  padding: 2px;
68
  }
69
- #filter_type label > .wrap .wrap-inner input {
70
- width: 1px;
71
- }
72
- #filter-columns-type {
73
- border: 0;
74
- padding: 0.5;
75
- }
76
- #filter-columns-size {
77
- border: 0;
78
- padding: 0.5;
79
- }
80
- #box-filter > .form {
81
- border: 0;
82
- }
83
-
84
- /* Header styles */
85
- #header-title {
86
- text-align: left;
87
- display: inline-block;
88
  }
89
-
90
- #header-row {
91
- display: flex;
92
- justify-content: space-between;
93
- align-items: center;
94
  }
95
-
96
- #header-row .gradio-html {
97
- flex-grow: 1;
98
  }
99
-
100
- #oauth-button {
101
- height: auto;
102
- min-width: max-content;
103
- white-space: nowrap;
104
- padding: 10px 20px;
105
- border-radius: 4px;
106
  }
107
  """
108
 
@@ -112,4 +108,4 @@ get_window_url_params = """
112
  url_params = Object.fromEntries(params);
113
  return url_params;
114
  }
115
- """
 
1
  custom_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
5
  }
 
21
  transform: scale(1.3);
22
  }
23
 
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
  #search-bar-table-box > div:first-child {
33
  background: none;
34
  border: none;
35
  }
36
+
37
  #search-bar {
38
  padding: 0px;
39
  }
40
 
41
+ /* Hides the final AutoEvalColumn */
42
+ #llm-benchmark-tab-table table td:last-child,
43
+ #llm-benchmark-tab-table table th:last-child {
44
+ display: none;
45
+ }
46
+
47
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
+ table td:first-child,
49
+ table th:first-child {
50
+ max-width: 400px;
51
+ overflow: auto;
52
+ white-space: nowrap;
53
+ }
54
+
55
  .tab-buttons button {
56
  font-size: 20px;
57
  }
58
 
59
+ #scale-logo {
60
+ border-style: none !important;
61
+ box-shadow: none;
62
+ display: block;
63
+ margin-left: auto;
64
+ margin-right: auto;
65
+ max-width: 600px;
66
+ }
67
+
68
+ #scale-logo .download {
69
+ display: none;
70
+ }
71
+ #filter_type{
72
  border: 0;
73
  padding-left: 0;
74
  padding-top: 0;
 
76
  #filter_type label {
77
  display: flex;
78
  }
79
+ #filter_type label > span{
80
  margin-top: var(--spacing-lg);
81
  margin-right: 0.5em;
82
  }
83
+ #filter_type label > .wrap{
84
  width: 103px;
85
  }
86
+ #filter_type label > .wrap .wrap-inner{
87
  padding: 2px;
88
  }
89
+ #filter_type label > .wrap .wrap-inner input{
90
+ width: 1px
 
91
  }
92
+ #filter-columns-type{
93
+ border:0;
94
+ padding:0.5;
 
 
95
  }
96
+ #filter-columns-size{
97
+ border:0;
98
+ padding:0.5;
99
  }
100
+ #box-filter > .form{
101
+ border: 0
102
  }
103
  """
104
 
 
108
  url_params = Object.fromEntries(params);
109
  return url_params;
110
  }
111
+ """
src/assets/hardcoded_evals.py ADDED
@@ -0,0 +1,40 @@
1
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
+
3
+ gpt4_values = {
4
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
+ AutoEvalColumn.revision.name: "tech report",
6
+ AutoEvalColumn.precision.name: None,
7
+ AutoEvalColumn.average.name: 84.3,
8
+ AutoEvalColumn.arc.name: 96.3,
9
+ AutoEvalColumn.hellaswag.name: 95.3,
10
+ AutoEvalColumn.mmlu.name: 86.4,
11
+ AutoEvalColumn.truthfulqa.name: 59.0,
12
+ AutoEvalColumn.dummy.name: "GPT-4",
13
+ AutoEvalColumn.model_type.name: "",
14
+ }
15
+
16
+ gpt35_values = {
17
+ AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
18
+ AutoEvalColumn.revision.name: "tech report",
19
+ AutoEvalColumn.precision.name: None,
20
+ AutoEvalColumn.average.name: 71.9,
21
+ AutoEvalColumn.arc.name: 85.2,
22
+ AutoEvalColumn.hellaswag.name: 85.5,
23
+ AutoEvalColumn.mmlu.name: 70.0,
24
+ AutoEvalColumn.truthfulqa.name: 47.0,
25
+ AutoEvalColumn.dummy.name: "GPT-3.5",
26
+ AutoEvalColumn.model_type.name: "",
27
+ }
28
+
29
+ baseline = {
30
+ AutoEvalColumn.model.name: "<p>Baseline</p>",
31
+ AutoEvalColumn.revision.name: "N/A",
32
+ AutoEvalColumn.precision.name: None,
33
+ AutoEvalColumn.average.name: 25.0,
34
+ AutoEvalColumn.arc.name: 25.0,
35
+ AutoEvalColumn.hellaswag.name: 25.0,
36
+ AutoEvalColumn.mmlu.name: 25.0,
37
+ AutoEvalColumn.truthfulqa.name: 25.0,
38
+ AutoEvalColumn.dummy.name: "baseline",
39
+ AutoEvalColumn.model_type.name: "",
40
+ }
src/assets/scale-hf-logo.png ADDED

Git LFS Details

  • SHA256: 11a263a1abe4c7c9cf022cbe052dc567dcea164bdfbc111299aae3270e992934
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
src/{display/about.py → assets/text_content.py} RENAMED
@@ -1,64 +1,52 @@
1
- from src.display.utils import ModelType
2
 
3
- TITLE = """<h1 style="text-align:left;float:left; id="space-title">🤗 Open LLM Leaderboard</h1> <h3 style="text-align:left;float:left;> Track, rank and evaluate open LLMs and chatbots </h3>"""
4
 
5
  INTRODUCTION_TEXT = """
6
- The previous Leaderboard version is live [here](https://huggingface.co/spaces/open-llm-leaderboard-old/open_llm_leaderboard) 📊 Feeling lost? Check out our [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄
7
- \n You'll notably find explanations on the evaluations we are using, reproducibility guidelines, best practices on how to submit a model, and our FAQ.
8
- """
9
 
10
- icons = f"""
11
- - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given text corpora using masked modelling
12
- - {ModelType.CPT.to_str(" : ")} model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
13
- - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
14
- - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
15
- - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
16
  """
17
- LLM_BENCHMARKS_TEXT = """
18
- ## ABOUT
 
19
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
20
 
21
- 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
22
- The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
 
 
 
 
23
 
24
- ### Tasks
25
- 📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 
 
 
26
 
27
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
28
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
29
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
30
- - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA is technically a 6-shot task in the Harness because each example is prepended with 6 Q/A pairs, even in the 0-shot setting.
31
- - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
32
- - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
33
 
34
  For all these evaluations, a higher score is a better score.
35
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
36
 
37
- ### Results
38
  You can find:
39
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
40
- - details on the input/outputs for the models in the `details` of each model, which you can access by clicking the 📄 emoji after the model name
41
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
42
 
43
- If a model's name contains "Flagged", this indicates it has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
44
-
45
- ---------------------------
46
-
47
- ## REPRODUCIBILITY
48
- To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
49
- `python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
50
- ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
51
 
52
- ```
53
- python main.py --model=hf-causal-experimental \
54
- --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>" \
55
- --tasks=<task_list> \
56
- --num_fewshot=<n_few_shot> \
57
- --batch_size=1 \
58
- --output_path=<output_path>
59
- ```
60
-
61
- **Note:** We evaluate all models on a single node of 8 H100s, so the global batch size is 8 for each evaluation. If you don't use parallelism, adapt your batch size to fit.
62
  *You can expect results to vary slightly for different batch sizes because of padding.*
63
 
64
  The tasks and few shots parameters are:
@@ -66,122 +54,23 @@ The tasks and few shots parameters are:
66
  - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
67
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
68
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
69
- - Winogrande: 5-shot, *winogrande* (`acc`)
70
- - GSM8k: 5-shot, *gsm8k* (`acc`)
71
-
72
- Side note on the baseline scores:
73
- - for log-likelihood evaluation, we select the random baseline
74
- - for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
75
 
76
- ---------------------------
77
-
78
- ## RESOURCES
79
-
80
- ### Quantization
81
  To get more information about quantization, see:
82
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
83
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
84
 
85
- ### Useful links
86
- - [Community resources](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/174)
87
- - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
88
-
89
- ### Other cool leaderboards:
90
- - [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
91
- - [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
92
-
93
-
94
  """
95
 
96
- FAQ_TEXT = """
97
-
98
- ## SUBMISSIONS
99
- My model requires `trust_remote_code=True`, can I submit it?
100
- - *We only support models that have been integrated into a stable version of the `transformers` library for automatic submission, as we don't want to run possibly unsafe code on our cluster.*
101
-
102
- What about models of type X?
103
- - *We only support models that have been integrated into a stable version of the `transformers` library for automatic submission.*
104
-
105
- How can I follow when my model is launched?
106
- - *You can look for its request file [here](https://huggingface.co/datasets/open-llm-leaderboard/requests) and follow the status evolution, or directly in the queues above the submit form.*
107
-
108
- My model disappeared from all the queues, what happened?
109
- - *A model disappearing from all the queues usually means that there has been a failure. You can check if that is the case by looking for your model [here](https://huggingface.co/datasets/open-llm-leaderboard/requests).*
110
-
111
- What causes an evaluation failure?
112
- - *Most of the failures we get come from problems in the submissions (corrupted files, config problems, wrong parameters selected for eval ...), so we'll be grateful if you first make sure you have followed the steps in `About`. However, from time to time, we have failures on our side (hardware/node failures, problems with an update of our backend, connectivity problems ending up in the results not being saved, ...).*
113
-
114
- How can I report an evaluation failure?
115
- - *As we store the logs for all models, feel free to create an issue, **where you link to the requests file of your model** (look for it [here](https://huggingface.co/datasets/open-llm-leaderboard/requests/tree/main)), so we can investigate! If the model failed due to a problem on our side, we'll relaunch it right away!*
116
- *Note: Please do not re-upload your model under a different name, it will not help*
117
-
118
- ---------------------------
119
-
120
- ## RESULTS
121
- What kind of information can I find?
122
- - *Let's imagine you are interested in the Yi-34B results. You have access to 3 different information categories:*
123
- - *The [request file](https://huggingface.co/datasets/open-llm-leaderboard/requests/blob/main/01-ai/Yi-34B_eval_request_False_bfloat16_Original.json): it gives you information about the status of the evaluation*
124
- - *The [aggregated results folder](https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/01-ai/Yi-34B): it gives you aggregated scores, per experimental run*
125
- - *The [details dataset](https://huggingface.co/datasets/open-llm-leaderboard/details_01-ai__Yi-34B/tree/main): it gives you the full details (scores and examples for each task and a given model)*
126
-
127
-
128
- Why do models appear several times in the leaderboard?
129
- - *We run evaluations with user-selected precision and model commit. Sometimes, users submit specific models at different commits and at different precisions (for example, in float16 and 4bit to see how quantization affects performance). You should be able to verify this by displaying the `precision` and `model sha` columns in the display. If, however, you see models appearing several times with the same precision and hash commit, this is not normal.*
130
-
131
- What is this concept of "flagging"?
132
- - *This mechanism allows users to report models that have unfair performance on the leaderboard. This contains several categories: exceedingly good results on the leaderboard because the model was (maybe accidentally) trained on the evaluation data, models that are copies of other models not attributed properly, etc.*
133
-
134
- My model has been flagged improperly, what can I do?
135
- - *Every flagged model has a discussion associated with it - feel free to plead your case there, and we'll see what to do together with the community.*
136
-
137
- ---------------------------
138
-
139
- ## HOW TO SEARCH FOR A MODEL
140
- Search for models in the leaderboard by:
141
- 1. Name, e.g., *model_name*
142
- 2. Multiple names, separated by `;`, e.g., *model_name1;model_name2*
143
- 3. License, prefix with `Hub License:...`, e.g., *Hub License: MIT*
144
- 4. Combination of name and license, order is irrelevant, e.g., *model_name; Hub License: cc-by-sa-4.0*
145
-
146
- ---------------------------
147
-
148
- ## EDITING SUBMISSIONS
149
- I upgraded my model and want to re-submit, how can I do that?
150
- - *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
151
-
152
- I need to rename my model, how can I do that?
153
- - *You can use @Weyaxi 's [super cool tool](https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-renamer) to request model name changes, then open a discussion where you link to the created pull request, and we'll check them and merge them as needed.*
154
-
155
- ---------------------------
156
-
157
- ## OTHER
158
- Why do you differentiate between pretrained, continuously pretrained, fine-tuned, merges, etc?
159
- - *These different models do not play in the same categories, and therefore need to be separated for fair comparison. Base pretrained models are the most interesting for the community, as they are usually good models to fine-tune later on - any jump in performance from a pretrained model represents a true improvement on the SOTA.
160
- Fine-tuned and IFT/RLHF/chat models usually have better performance, but the latter might be more sensitive to system prompts, which we do not cover at the moment in the Open LLM Leaderboard.
161
- Merges and moerges have artificially inflated performance on test sets, which is not always explainable, and does not always apply to real-world situations.*
162
-
163
- What should I use the leaderboard for?
164
- - *We recommend using the leaderboard for 3 use cases: 1) getting an idea of the state of open pretrained models, by looking only at the ranks and score of this category; 2) experimenting with different fine-tuning methods, datasets, quantization techniques, etc, and comparing their score in a reproducible setup, and 3) checking the performance of a model of interest to you, wrt to other models of its category.*
165
-
166
- Why don't you display closed-source model scores?
167
- - *This is a leaderboard for Open models, both for philosophical reasons (openness is cool) and for practical reasons: we want to ensure that the results we display are accurate and reproducible, but 1) commercial closed models can change their API thus rendering any scoring at a given time incorrect 2) we re-run everything on our cluster to ensure all models are run on the same setup and you can't do that for these models.*
168
-
169
- I have an issue with accessing the leaderboard through the Gradio API
170
- - *Since this is not the recommended way to access the leaderboard, we won't provide support for this, but you can look at tools provided by the community for inspiration!*
171
-
172
- I have another problem, help!
173
- - *Please open an issue in the discussion tab, and we'll do our best to help you in a timely manner :) *
174
- """
175
-
176
-
177
- EVALUATION_QUEUE_TEXT = f"""
178
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
179
 
180
  Models added here will be automatically evaluated on the 🤗 cluster.
181
 
182
- ## Don't forget to read the FAQ and the About tabs for more information!
183
-
184
- ## First steps before submitting a model
185
 
186
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
187
  ```python
@@ -204,32 +93,21 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
204
  ### 4) Fill up your model card
205
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
206
 
207
- ### 5) Select the correct precision
208
- Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
209
-
210
- <b>Note:</b> Please be advised that when submitting, git <b>branches</b> and <b>tags</b> will be strictly tied to the <b>specific commit</b> present at the time of submission. This ensures revision consistency.
211
- ## Model types
212
- {icons}
213
  """
214
 
215
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
216
  CITATION_BUTTON_TEXT = r"""
217
- @misc{open-llm-leaderboard-v2,
218
- author = {Clémentine Fourrier and Nathan Habib and Alina Lozovskaya and Konrad Szafer and Thomas Wolf},
219
- title = {Open LLM Leaderboard v2},
220
- year = {2024},
221
- publisher = {Hugging Face},
222
- howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}",
223
- }
224
-
225
- @misc{open-llm-leaderboard-v1,
226
- author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
227
- title = {Open LLM Leaderboard (2023-2024)},
228
  year = {2023},
229
  publisher = {Hugging Face},
230
- howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
231
  }
232
-
233
  @software{eval-harness,
234
  author = {Gao, Leo and
235
  Tow, Jonathan and
@@ -254,66 +132,37 @@ CITATION_BUTTON_TEXT = r"""
254
  publisher = {Zenodo},
255
  version = {v0.0.1},
256
  doi = {10.5281/zenodo.5371628},
257
- url = {https://doi.org/10.5281/zenodo.5371628},
258
- }
259
-
260
- @misc{zhou2023instructionfollowingevaluationlargelanguage,
261
- title={Instruction-Following Evaluation for Large Language Models},
262
- author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
263
- year={2023},
264
- eprint={2311.07911},
265
- archivePrefix={arXiv},
266
- primaryClass={cs.CL},
267
- url={https://arxiv.org/abs/2311.07911},
268
- }
269
-
270
- @misc{suzgun2022challengingbigbenchtaskschainofthought,
271
- title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
272
- author={Mirac Suzgun and Nathan Scales and Nathanael Schärli and Sebastian Gehrmann and Yi Tay and Hyung Won Chung and Aakanksha Chowdhery and Quoc V. Le and Ed H. Chi and Denny Zhou and Jason Wei},
273
- year={2022},
274
- eprint={2210.09261},
275
- archivePrefix={arXiv},
276
- primaryClass={cs.CL},
277
- url={https://arxiv.org/abs/2210.09261},
278
  }
279
-
280
- @misc{hendrycks2021measuringmathematicalproblemsolving,
281
- title={Measuring Mathematical Problem Solving With the MATH Dataset},
282
- author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
283
- year={2021},
284
- eprint={2103.03874},
285
- archivePrefix={arXiv},
286
- primaryClass={cs.LG},
287
- url={https://arxiv.org/abs/2103.03874},
288
  }
289
-
290
- @misc{rein2023gpqagraduatelevelgoogleproofqa,
291
- title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
292
- author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
293
- year={2023},
294
- eprint={2311.12022},
295
- archivePrefix={arXiv},
296
- primaryClass={cs.AI},
297
- url={https://arxiv.org/abs/2311.12022},
298
  }
299
-
300
- @misc{sprague2024musrtestinglimitschainofthought,
301
- title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning},
302
- author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett},
303
- year={2024},
304
- eprint={2310.16049},
305
- archivePrefix={arXiv},
306
- primaryClass={cs.CL},
307
- url={https://arxiv.org/abs/2310.16049},
308
  }
309
-
310
- @misc{wang2024mmluprorobustchallengingmultitask,
311
- title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark},
312
- author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
313
- year={2024},
314
- eprint={2406.01574},
315
- archivePrefix={arXiv},
316
- primaryClass={cs.CL},
317
- url={https://arxiv.org/abs/2406.01574},
318
- }
319
- """
 
1
+ from src.display_models.model_metadata_type import ModelType
2
 
3
+ TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
+ 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
 
 
7
 
8
+ 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
9
+ The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 
 
 
 
10
  """
11
+
12
+ LLM_BENCHMARKS_TEXT = f"""
13
+ # Context
14
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims about their performance, it can be hard to sift out the genuine progress being made by the open-source community and to tell which model is the current state of the art.
15
 
16
+ ## Icons
17
+ {ModelType.PT.to_str(" : ")} model
18
+ {ModelType.FT.to_str(" : ")} model
19
+ {ModelType.IFT.to_str(" : ")} model
20
+ {ModelType.RL.to_str(" : ")} model
21
+ If there is no icon, we have not uploaded the information on the model yet; feel free to open an issue with the model information! The short sketch below shows how these icon strings are generated.
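For reference, these icon strings come from the `ModelType` enum added in `src/display_models/model_metadata_type.py` further down in this diff; a minimal sketch of how they render (expected output shown in comments):

```python
# Minimal sketch: how the icons above are rendered by ModelType.to_str (see model_metadata_type.py below).
from src.display_models.model_metadata_type import ModelType

print(ModelType.PT.to_str(" : "))   # 🟢 : pretrained
print(ModelType.FT.to_str(" : "))   # 🔶 : fine-tuned
print(ModelType.IFT.to_str(" : "))  # ⭕ : instruction-tuned
print(ModelType.RL.to_str(" : "))   # 🟦 : RL-tuned
```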
22
 
23
+ 🏴‍☠️ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
24
+ (For example, the model was trained on the evaluation data and is therefore cheating on the leaderboard.)
25
+
26
+ ## How it works
27
+
28
+ 📈 We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
29
 
30
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
31
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
32
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
33
+ - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a models propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
 
 
34
 
35
  For all these evaluations, a higher score is a better score.
36
  We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
37
 
38
+ ## Details and logs
39
  You can find:
40
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
41
+ - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
42
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
43
 
44
+ ## Reproducibility
45
+ To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
46
+ `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
47
+ ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
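For readability, the same command can be laid out as a single multi-line invocation (the angle-bracket placeholders are left for you to fill in):

```
python main.py --model=hf-causal \
    --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>" \
    --tasks=<task_list> \
    --num_fewshot=<n_few_shot> \
    --batch_size=2 \
    --output_path=<output_path>
```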
 
 
 
 
48
 
49
+ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs × a per-GPU batch size of 2). If you don't use parallelism, adapt your batch size to fit.
50
  *You can expect results to vary slightly for different batch sizes because of padding.*
51
 
52
  The tasks and few shots parameters are:
 
54
  - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
55
  - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
56
  - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
 
 
 
 
 
 
57
 
58
+ ## Quantization
 
 
 
 
59
  To get more information about quantization, see:
60
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
61
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
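As a rough illustration (not the leaderboard's own evaluation setup), loading a model in 8-bit or 4-bit with `transformers` and `bitsandbytes` looks roughly like this; the model id below is a placeholder:

```python
# Minimal sketch: quantized loading with bitsandbytes (pip install transformers accelerate bitsandbytes).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "<your_model>"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",   # dispatch weights across available devices
    load_in_8bit=True,   # or load_in_4bit=True for 4-bit quantization
)
```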
62
 
63
+ ## More resources
64
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/179)!
65
+ We also gather cool resources from the community, other teams, and other labs [here](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)!
 
 
 
 
 
 
66
  """
67
 
68
+ EVALUATION_QUEUE_TEXT = """
69
  # Evaluation Queue for the 🤗 Open LLM Leaderboard
70
 
71
  Models added here will be automatically evaluated on the 🤗 cluster.
72
 
73
+ ## Some good practices before submitting a model
 
 
74
 
75
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
76
  ```python
 
93
  ### 4) Fill up your model card
94
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
95
 
96
+ ## In case of model failure
97
+ If your model is displayed in the `FAILED` category, its execution stopped.
98
+ Make sure you have followed the above steps first.
99
+ If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
 
100
  """
101
 
102
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
103
  CITATION_BUTTON_TEXT = r"""
104
+ @misc{open-llm-leaderboard,
105
+ author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
106
+ title = {Open LLM Leaderboard},
107
  year = {2023},
108
  publisher = {Hugging Face},
109
+ howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
110
  }
 
111
  @software{eval-harness,
112
  author = {Gao, Leo and
113
  Tow, Jonathan and
 
132
  publisher = {Zenodo},
133
  version = {v0.0.1},
134
  doi = {10.5281/zenodo.5371628},
135
+ url = {https://doi.org/10.5281/zenodo.5371628}
136
  }
137
+ @misc{clark2018think,
138
+ title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
139
+ author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
140
+ year={2018},
141
+ eprint={1803.05457},
142
+ archivePrefix={arXiv},
143
+ primaryClass={cs.AI}
 
 
144
  }
145
+ @misc{zellers2019hellaswag,
146
+ title={HellaSwag: Can a Machine Really Finish Your Sentence?},
147
+ author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
148
+ year={2019},
149
+ eprint={1905.07830},
150
+ archivePrefix={arXiv},
151
+ primaryClass={cs.CL}
 
 
152
  }
153
+ @misc{hendrycks2021measuring,
154
+ title={Measuring Massive Multitask Language Understanding},
155
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
156
+ year={2021},
157
+ eprint={2009.03300},
158
+ archivePrefix={arXiv},
159
+ primaryClass={cs.CY}
 
 
160
  }
161
+ @misc{lin2022truthfulqa,
162
+ title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
163
+ author={Stephanie Lin and Jacob Hilton and Owain Evans},
164
+ year={2022},
165
+ eprint={2109.07958},
166
+ archivePrefix={arXiv},
167
+ primaryClass={cs.CL}
168
+ }"""
 
 
 
src/display/formatting.py DELETED
@@ -1,36 +0,0 @@
1
- from huggingface_hub import HfApi
2
-
3
- API = HfApi()
4
-
5
-
6
- def model_hyperlink(link, model_name):
7
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
8
-
9
-
10
- def make_clickable_model(model_name):
11
- link = f"https://huggingface.co/{model_name}"
12
-
13
- details_model_name = model_name.replace("/", "__")
14
- details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
15
-
16
- return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
17
-
18
-
19
- def styled_error(error):
20
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
21
-
22
-
23
- def styled_warning(warn):
24
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
25
-
26
-
27
- def styled_message(message):
28
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
29
-
30
-
31
- def has_no_nan_values(df, columns):
32
- return df[columns].notna().all(axis=1)
33
-
34
-
35
- def has_nan_values(df, columns):
36
- return df[columns].isna().any(axis=1)
src/display/utils.py DELETED
@@ -1,260 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
- import json
4
- import logging
5
- from datetime import datetime
6
- import pandas as pd
7
-
8
-
9
- # Configure logging
10
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
-
12
- # Convert ISO 8601 dates to datetime objects for comparison
13
- def parse_iso8601_datetime(date_str):
14
- if date_str.endswith('Z'):
15
- date_str = date_str[:-1] + '+00:00'
16
- return datetime.fromisoformat(date_str)
17
-
18
- def parse_datetime(datetime_str):
19
- formats = [
20
- "%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
21
- "%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
22
- "%Y-%m-%dT%H %M %S.%f", # Spaces as separator
23
- ]
24
-
25
- for fmt in formats:
26
- try:
27
- return datetime.strptime(datetime_str, fmt)
28
- except ValueError:
29
- continue
30
- # in rare cases set unix start time for files with incorrect time (legacy files)
31
- logging.error(f"No valid date format found for: {datetime_str}")
32
- return datetime(1970, 1, 1)
33
-
34
-
35
- def load_json_data(file_path):
36
- """Safely load JSON data from a file."""
37
- try:
38
- with open(file_path, "r") as file:
39
- return json.load(file)
40
- except json.JSONDecodeError:
41
- print(f"Error reading JSON from {file_path}")
42
- return None # Or raise an exception
43
-
44
-
45
- def fields(raw_class):
46
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
47
-
48
-
49
- @dataclass
50
- class Task:
51
- benchmark: str
52
- metric: str
53
- col_name: str
54
-
55
-
56
- class Tasks(Enum):
57
- ifeval = Task("leaderboard_ifeval", "strict_acc,none", "IFEval")
58
- ifeval_raw = Task("leaderboard_ifeval", "strict_acc,none", "IFEval Raw")
59
-
60
- bbh = Task("leaderboard_bbh", "acc_norm,none", "BBH")
61
- bbh_raw = Task("leaderboard_bbh", "acc_norm,none", "BBH Raw")
62
-
63
- math = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5")
64
- math_raw = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5 Raw")
65
-
66
- gpqa = Task("leaderboard_gpqa", "acc_norm,none", "GPQA")
67
- gpqa_raw = Task("leaderboard_gpqa", "acc_norm,none", "GPQA Raw")
68
-
69
- musr = Task("leaderboard_musr", "acc_norm,none", "MUSR")
70
- musr_raw = Task("leaderboard_musr", "acc_norm,none", "MUSR Raw")
71
-
72
- mmlu_pro = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO")
73
- mmlu_pro_raw = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO Raw")
74
-
75
-
76
- # These classes are for user facing column names,
77
- # to avoid having to change them all around the code
78
- # when a modif is needed
79
- @dataclass(frozen=True)
80
- class ColumnContent:
81
- name: str
82
- type: str
83
- displayed_by_default: bool
84
- hidden: bool = False
85
- never_hidden: bool = False
86
- dummy: bool = False
87
-
88
-
89
- auto_eval_column_dict = []
90
- # Init
91
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
92
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
93
- # Scores
94
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
95
- for task in Tasks:
96
- displayed_by_default = not task.name.endswith("_raw")
97
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default=displayed_by_default)])
98
- # Model information
99
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
100
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
101
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
102
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
103
- auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Not_Merged", "bool", False)])
104
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
105
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
106
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
107
- auto_eval_column_dict.append(
108
- ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
109
- )
110
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
111
- auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
112
- auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
113
- # to rename
114
- auto_eval_column_dict.append(["submission_date", ColumnContent, ColumnContent("submission_date", "date", False, hidden=True)])
115
- auto_eval_column_dict.append(["upload_to_hub", ColumnContent, ColumnContent("upload_to_hub", "date", False, hidden=True)])
116
-
117
- auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Chat Template", "bool", False)])
118
- auto_eval_column_dict.append(["maintainers_highlight", ColumnContent, ColumnContent("Maintainer's Highlight", "bool", False, hidden=True)])
119
-
120
- # fullname structure: <user>/<model_name>
121
- auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
122
-
123
- # We use make dataclass to dynamically fill the scores from Tasks
124
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
125
-
126
-
127
- @dataclass(frozen=True)
128
- class EvalQueueColumn: # Queue column
129
- model_link = ColumnContent("model_link", "markdown", True)
130
- model_name = ColumnContent("model_name", "str", True)
131
- revision = ColumnContent("revision", "str", True)
132
- #private = ColumnContent("private", "bool", True) # Should not be displayed
133
- precision = ColumnContent("precision", "str", True)
134
- #weight_type = ColumnContent("weight_type", "str", "Original") # Might be confusing, to think about
135
- status = ColumnContent("status", "str", True)
136
-
137
-
138
- # baseline_row = {
139
- # AutoEvalColumn.model.name: "<p>Baseline</p>",
140
- # AutoEvalColumn.revision.name: "N/A",
141
- # AutoEvalColumn.precision.name: None,
142
- # AutoEvalColumn.merged.name: False,
143
- # AutoEvalColumn.average.name: 31.0,
144
- # AutoEvalColumn.arc.name: 25.0,
145
- # AutoEvalColumn.hellaswag.name: 25.0,
146
- # AutoEvalColumn.mmlu.name: 25.0,
147
- # AutoEvalColumn.truthfulqa.name: 25.0,
148
- # AutoEvalColumn.winogrande.name: 50.0,
149
- # AutoEvalColumn.gsm8k.name: 0.21,
150
- # AutoEvalColumn.fullname.name: "baseline",
151
- # AutoEvalColumn.model_type.name: "",
152
- # AutoEvalColumn.not_flagged.name: False,
153
- # }
154
-
155
- # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
156
- # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
157
- # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
158
- # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
159
- # TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
160
- # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
161
- # GSM8K: paper
162
- # Define the human baselines
163
- # human_baseline_row = {
164
- # AutoEvalColumn.model.name: "<p>Human performance</p>",
165
- # AutoEvalColumn.revision.name: "N/A",
166
- # AutoEvalColumn.precision.name: None,
167
- # AutoEvalColumn.average.name: 92.75,
168
- # AutoEvalColumn.merged.name: False,
169
- # AutoEvalColumn.arc.name: 80.0,
170
- # AutoEvalColumn.hellaswag.name: 95.0,
171
- # AutoEvalColumn.mmlu.name: 89.8,
172
- # AutoEvalColumn.truthfulqa.name: 94.0,
173
- # AutoEvalColumn.winogrande.name: 94.0,
174
- # AutoEvalColumn.gsm8k.name: 100,
175
- # AutoEvalColumn.fullname.name: "human_baseline",
176
- # AutoEvalColumn.model_type.name: "",
177
- # AutoEvalColumn.not_flagged.name: False,
178
- # }
179
-
180
-
181
- @dataclass
182
- class ModelDetails:
183
- name: str
184
- symbol: str = "" # emoji, only for the model type
185
-
186
-
187
- class ModelType(Enum):
188
- PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
189
- CPT = ModelDetails(name="🟩 continuously pretrained", symbol="🟩")
190
- FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
191
- chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
192
- merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
193
- Unknown = ModelDetails(name="❓ other", symbol="❓")
194
-
195
- def to_str(self, separator=" "):
196
- return f"{self.value.symbol}{separator}{self.value.name}"
197
-
198
- @staticmethod
199
- def from_str(m_type):
200
- if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
201
- return ModelType.FT
202
- if "continuously pretrained" in m_type or "🟩" in m_type:
203
- return ModelType.CPT
204
- if "pretrained" in m_type or "🟢" in m_type:
205
- return ModelType.PT
206
- if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
207
- return ModelType.chat
208
- if "merge" in m_type or "🤝" in m_type:
209
- return ModelType.merges
210
- return ModelType.Unknown
211
-
212
-
213
- class WeightType(Enum):
214
- Adapter = ModelDetails("Adapter")
215
- Original = ModelDetails("Original")
216
- Delta = ModelDetails("Delta")
217
-
218
-
219
- class Precision(Enum):
220
- float16 = ModelDetails("float16")
221
- bfloat16 = ModelDetails("bfloat16")
222
- qt_8bit = ModelDetails("8bit")
223
- qt_4bit = ModelDetails("4bit")
224
- qt_GPTQ = ModelDetails("GPTQ")
225
- Unknown = ModelDetails("?")
226
-
227
- @staticmethod
228
- def from_str(precision):
229
- if precision in ["torch.float16", "float16"]:
230
- return Precision.float16
231
- if precision in ["torch.bfloat16", "bfloat16"]:
232
- return Precision.bfloat16
233
- if precision in ["8bit"]:
234
- return Precision.qt_8bit
235
- if precision in ["4bit"]:
236
- return Precision.qt_4bit
237
- if precision in ["GPTQ", "None"]:
238
- return Precision.qt_GPTQ
239
- return Precision.Unknown
240
-
241
-
242
- # Column selection
243
- COLS = [c.name for c in fields(AutoEvalColumn)]
244
- TYPES = [c.type for c in fields(AutoEvalColumn)]
245
-
246
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
247
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
248
-
249
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
250
-
251
- NUMERIC_INTERVALS = {
252
- "?": pd.Interval(-1, 0, closed="right"),
253
- "~1.5": pd.Interval(0, 2, closed="right"),
254
- "~3": pd.Interval(2, 4, closed="right"),
255
- "~7": pd.Interval(4, 9, closed="right"),
256
- "~13": pd.Interval(9, 20, closed="right"),
257
- "~35": pd.Interval(20, 45, closed="right"),
258
- "~60": pd.Interval(45, 70, closed="right"),
259
- "70+": pd.Interval(70, 10000, closed="right"),
260
- }
src/display_models/get_model_metadata.py ADDED
@@ -0,0 +1,167 @@
1
+ import glob
2
+ import json
3
+ import os
4
+ import re
5
+ import pickle
6
+ from typing import List
7
+
8
+ import huggingface_hub
9
+ from huggingface_hub import HfApi
10
+ from tqdm import tqdm
11
+ from transformers import AutoModel, AutoConfig
12
+ from accelerate import init_empty_weights
13
+
14
+ from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
15
+ from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
16
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
17
+
18
+ api = HfApi(token=os.environ.get("H4_TOKEN", None))
19
+
20
+
21
+ def get_model_infos_from_hub(leaderboard_data: List[dict]):
22
+ # load cache from disk
23
+ try:
24
+ with open("model_info_cache.pkl", "rb") as f:
25
+ model_info_cache = pickle.load(f)
26
+ except (EOFError, FileNotFoundError):
27
+ model_info_cache = {}
28
+ try:
29
+ with open("model_size_cache.pkl", "rb") as f:
30
+ model_size_cache = pickle.load(f)
31
+ except (EOFError, FileNotFoundError):
32
+ model_size_cache = {}
33
+
34
+ for model_data in tqdm(leaderboard_data):
35
+ model_name = model_data["model_name_for_query"]
36
+
37
+ if model_name in model_info_cache:
38
+ model_info = model_info_cache[model_name]
39
+ else:
40
+ try:
41
+ model_info = api.model_info(model_name)
42
+ model_info_cache[model_name] = model_info
43
+ except huggingface_hub.utils._errors.RepositoryNotFoundError:
44
+ print("Repo not found!", model_name)
45
+ model_data[AutoEvalColumn.license.name] = None
46
+ model_data[AutoEvalColumn.likes.name] = None
47
+ if model_name not in model_size_cache:
48
+ model_size_cache[model_name] = get_model_size(model_name, None)
49
+ model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
50
+
51
+ model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
52
+ model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
53
+ if model_name not in model_size_cache:
54
+ model_size_cache[model_name] = get_model_size(model_name, model_info)
55
+ model_data[AutoEvalColumn.params.name] = model_size_cache[model_name]
56
+
57
+ # save cache to disk in pickle format
58
+ with open("model_info_cache.pkl", "wb") as f:
59
+ pickle.dump(model_info_cache, f)
60
+ with open("model_size_cache.pkl", "wb") as f:
61
+ pickle.dump(model_size_cache, f)
62
+
63
+
64
+ def get_model_license(model_info):
65
+ try:
66
+ return model_info.cardData["license"]
67
+ except Exception:
68
+ return "?"
69
+
70
+
71
+ def get_model_likes(model_info):
72
+ return model_info.likes
73
+
74
+
75
+ size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
76
+
77
+
78
+ def get_model_size(model_name, model_info):
79
+ # In billions
80
+ try:
81
+ return round(model_info.safetensors["total"] / 1e9, 3)
82
+ except AttributeError:
83
+ try:
84
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=False)
85
+ with init_empty_weights():
86
+ model = AutoModel.from_config(config, trust_remote_code=False)
87
+ return round(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9, 3)
88
+ except (EnvironmentError, ValueError, KeyError): # model config not found, likely private
89
+ try:
90
+ size_match = re.search(size_pattern, model_name.lower())
91
+ size = size_match.group(0)
92
+ return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
93
+ except AttributeError:
94
+ return 0
95
+
96
+
97
+ def get_model_type(leaderboard_data: List[dict]):
98
+ for model_data in leaderboard_data:
99
+ request_files = os.path.join(
100
+ "eval-queue",
101
+ model_data["model_name_for_query"] + "_eval_request_*" + ".json",
102
+ )
103
+ request_files = glob.glob(request_files)
104
+
105
+ # Select correct request file (precision)
106
+ request_file = ""
107
+ if len(request_files) == 1:
108
+ request_file = request_files[0]
109
+ elif len(request_files) > 1:
110
+ request_files = sorted(request_files, reverse=True)
111
+ for tmp_request_file in request_files:
112
+ with open(tmp_request_file, "r") as f:
113
+ req_content = json.load(f)
114
+ if (
115
+ req_content["status"] == "FINISHED"
116
+ and req_content["precision"] == model_data["Precision"].split(".")[-1]
117
+ ):
118
+ request_file = tmp_request_file
119
+
120
+ try:
121
+ with open(request_file, "r") as f:
122
+ request = json.load(f)
123
+ model_type = model_type_from_str(request["model_type"])
124
+ model_data[AutoEvalColumn.model_type.name] = model_type.value.name
125
+ model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("🔺" if is_delta else "")
126
+ except Exception:
127
+ if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
128
+ model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
129
+ model_data["model_name_for_query"]
130
+ ].value.name
131
+ model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
132
+ model_data["model_name_for_query"]
133
+ ].value.symbol # + ("🔺" if is_delta else "")
134
+ else:
135
+ model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
136
+ model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
137
+
138
+
139
+ def flag_models(leaderboard_data: List[dict]):
140
+ for model_data in leaderboard_data:
141
+ if model_data["model_name_for_query"] in FLAGGED_MODELS:
142
+ issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
143
+ issue_link = model_hyperlink(
144
+ FLAGGED_MODELS[model_data["model_name_for_query"]],
145
+ f"See discussion #{issue_num}",
146
+ )
147
+ model_data[
148
+ AutoEvalColumn.model.name
149
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
150
+
151
+
152
+ def remove_forbidden_models(leaderboard_data: List[dict]):
153
+ indices_to_remove = []
154
+ for ix, model in enumerate(leaderboard_data):
155
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
156
+ indices_to_remove.append(ix)
157
+
158
+ for ix in reversed(indices_to_remove):
159
+ leaderboard_data.pop(ix)
160
+ return leaderboard_data
161
+
162
+
163
+ def apply_metadata(leaderboard_data: List[dict]):
164
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
165
+ get_model_type(leaderboard_data)
166
+ get_model_infos_from_hub(leaderboard_data)
167
+ flag_models(leaderboard_data)
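For orientation, a rough sketch of how these helpers are chained by the app (the row below is illustrative, and network access to the Hub is assumed):

```python
# Illustrative only: enrich leaderboard rows in place with Hub metadata, model type, and flags.
from src.display_models.get_model_metadata import apply_metadata

leaderboard_data = [{"model_name_for_query": "EleutherAI/gpt-j-6b", "Precision": "torch.float16"}]
apply_metadata(leaderboard_data)  # removes forbidden models, then fills type, license, likes, #params, flags
```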
src/display_models/model_metadata_flags.py ADDED
@@ -0,0 +1,18 @@
1
+ # Models which have been flagged by users as being problematic for a reason or another
2
+ # (Model name to forum discussion link)
3
+ FLAGGED_MODELS = {
4
+ "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
5
+ "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
6
+ "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
7
+ "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
8
+ "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
9
+ "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
10
+ "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
11
+ "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
12
+ "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
13
+ }
14
+
15
+ # Models which have been requested by orgs to not be submitted on the leaderboard
16
+ DO_NOT_SUBMIT_MODELS = [
17
+ "Voicelab/trurl-2-13b", # trained on MMLU
18
+ ]
src/display_models/model_metadata_type.py ADDED
@@ -0,0 +1,555 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict
4
+
5
+
6
+ @dataclass
7
+ class ModelInfo:
8
+ name: str
9
+ symbol: str # emoji
10
+
11
+
12
+ class ModelType(Enum):
13
+ PT = ModelInfo(name="pretrained", symbol="🟢")
14
+ FT = ModelInfo(name="fine-tuned", symbol="🔶")
15
+ IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
16
+ RL = ModelInfo(name="RL-tuned", symbol="🟦")
17
+ Unknown = ModelInfo(name="Unknown", symbol="?")
18
+
19
+ def to_str(self, separator=" "):
20
+ return f"{self.value.symbol}{separator}{self.value.name}"
21
+
22
+
23
+ MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
+ "tiiuae/falcon-180B": ModelType.PT,
25
+ "tiiuae/falcon-180B-chat": ModelType.RL,
26
+ "microsoft/phi-1_5": ModelType.PT,
27
+ "Qwen/Qwen-7B": ModelType.PT,
28
+ "Qwen/Qwen-7B-Chat": ModelType.RL,
29
+ "notstoic/PygmalionCoT-7b": ModelType.IFT,
30
+ "aisquared/dlite-v1-355m": ModelType.IFT,
31
+ "aisquared/dlite-v1-1_5b": ModelType.IFT,
32
+ "aisquared/dlite-v1-774m": ModelType.IFT,
33
+ "aisquared/dlite-v1-124m": ModelType.IFT,
34
+ "aisquared/chopt-2_7b": ModelType.IFT,
35
+ "aisquared/dlite-v2-124m": ModelType.IFT,
36
+ "aisquared/dlite-v2-774m": ModelType.IFT,
37
+ "aisquared/dlite-v2-1_5b": ModelType.IFT,
38
+ "aisquared/chopt-1_3b": ModelType.IFT,
39
+ "aisquared/dlite-v2-355m": ModelType.IFT,
40
+ "augtoma/qCammel-13": ModelType.IFT,
41
+ "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
42
+ "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
43
+ "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
44
+ "TheBloke/tulu-7B-fp16": ModelType.IFT,
45
+ "TheBloke/guanaco-7B-HF": ModelType.FT,
46
+ "TheBloke/koala-7B-HF": ModelType.FT,
47
+ "TheBloke/wizardLM-7B-HF": ModelType.IFT,
48
+ "TheBloke/airoboros-13B-HF": ModelType.IFT,
49
+ "TheBloke/koala-13B-HF": ModelType.FT,
50
+ "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
51
+ "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
52
+ "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
53
+ "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
54
+ "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
55
+ "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
56
+ "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
57
+ "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
58
+ "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
59
+ "TheBloke/guanaco-13B-HF": ModelType.FT,
60
+ "TheBloke/guanaco-65B-HF": ModelType.FT,
61
+ "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
62
+ "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
63
+ "TheBloke/Llama-2-13B-fp16": ModelType.PT,
64
+ "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
65
+ "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
66
+ "TheBloke/Planner-7B-fp16": ModelType.IFT,
67
+ "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
68
+ "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
69
+ "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
70
+ "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
71
+ "TheBloke/tulu-13B-fp16": ModelType.IFT,
72
+ "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
73
+ "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
74
+ "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
75
+ "TheBloke/robin-13B-v2-fp16": ModelType.FT,
76
+ "TheBloke/robin-33B-v2-fp16": ModelType.FT,
77
+ "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
78
+ "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
79
+ "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
80
+ "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
81
+ "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
82
+ "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
83
+ "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
84
+ "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
85
+ "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
86
+ "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
87
+ "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
88
+ "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
89
+ "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
90
+ "concedo/OPT-19M-ChatSalad": ModelType.FT,
91
+ "concedo/Pythia-70M-ChatSalad": ModelType.FT,
92
+ "digitous/13B-HyperMantis": ModelType.IFT,
93
+ "digitous/Adventien-GPTJ": ModelType.FT,
94
+ "digitous/Alpacino13b": ModelType.IFT,
95
+ "digitous/GPT-R": ModelType.IFT,
96
+ "digitous/Javelin-R": ModelType.IFT,
97
+ "digitous/Javalion-GPTJ": ModelType.IFT,
98
+ "digitous/Javalion-R": ModelType.IFT,
99
+ "digitous/Skegma-GPTJ": ModelType.FT,
100
+ "digitous/Alpacino30b": ModelType.IFT,
101
+ "digitous/Janin-GPTJ": ModelType.FT,
102
+ "digitous/Janin-R": ModelType.FT,
103
+ "digitous/Javelin-GPTJ": ModelType.FT,
104
+ "SaylorTwift/gpt2_test": ModelType.PT,
105
+ "anton-l/gpt-j-tiny-random": ModelType.FT,
106
+ "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
107
+ "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
108
+ "Lazycuber/Janemalion-6B": ModelType.FT,
109
+ "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
110
+ "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
111
+ "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
112
+ "gpt2-medium": ModelType.PT,
113
+ "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
114
+ "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
115
+ "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
116
+ "PygmalionAI/pygmalion-6b": ModelType.FT,
117
+ "PygmalionAI/metharme-1.3b": ModelType.IFT,
118
+ "PygmalionAI/pygmalion-1.3b": ModelType.FT,
119
+ "PygmalionAI/pygmalion-350m": ModelType.FT,
120
+ "PygmalionAI/pygmalion-2.7b": ModelType.FT,
121
+ "medalpaca/medalpaca-7b": ModelType.FT,
122
+ "lilloukas/Platypus-30B": ModelType.IFT,
123
+ "lilloukas/GPlatty-30B": ModelType.FT,
124
+ "mncai/chatdoctor": ModelType.FT,
125
+ "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
126
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
127
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
128
+ "hakurei/instruct-12b": ModelType.IFT,
129
+ "hakurei/lotus-12B": ModelType.FT,
130
+ "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
131
+ "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
132
+ "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
133
+ "mosaicml/mpt-7b-instruct": ModelType.IFT,
134
+ "mosaicml/mpt-30b-chat": ModelType.IFT,
135
+ "mosaicml/mpt-7b-storywriter": ModelType.FT,
136
+ "mosaicml/mpt-30b-instruct": ModelType.IFT,
137
+ "mosaicml/mpt-7b-chat": ModelType.IFT,
138
+ "mosaicml/mpt-30b": ModelType.PT,
139
+ "Corianas/111m": ModelType.IFT,
140
+ "Corianas/Quokka_1.3b": ModelType.IFT,
141
+ "Corianas/256_5epoch": ModelType.FT,
142
+ "Corianas/Quokka_256m": ModelType.IFT,
143
+ "Corianas/Quokka_590m": ModelType.IFT,
144
+ "Corianas/gpt-j-6B-Dolly": ModelType.FT,
145
+ "Corianas/Quokka_2.7b": ModelType.IFT,
146
+ "cyberagent/open-calm-7b": ModelType.FT,
147
+ "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
148
+ "THUDM/chatglm2-6b": ModelType.IFT,
149
+ "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
150
+ "NYTK/PULI-GPTrio": ModelType.PT,
151
+ "EleutherAI/pythia-1.3b": ModelType.PT,
152
+ "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
153
+ "EleutherAI/gpt-neo-125m": ModelType.PT,
154
+ "EleutherAI/pythia-160m": ModelType.PT,
155
+ "EleutherAI/gpt-neo-2.7B": ModelType.PT,
156
+ "EleutherAI/pythia-1b-deduped": ModelType.PT,
157
+ "EleutherAI/pythia-6.7b": ModelType.PT,
158
+ "EleutherAI/pythia-70m-deduped": ModelType.PT,
159
+ "EleutherAI/gpt-neox-20b": ModelType.PT,
160
+ "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
161
+ "EleutherAI/pythia-2.7b": ModelType.PT,
162
+ "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
163
+ "EleutherAI/pythia-70m": ModelType.PT,
164
+ "EleutherAI/gpt-j-6b": ModelType.PT,
165
+ "EleutherAI/pythia-12b-deduped": ModelType.PT,
166
+ "EleutherAI/gpt-neo-1.3B": ModelType.PT,
167
+ "EleutherAI/pythia-410m-deduped": ModelType.PT,
168
+ "EleutherAI/pythia-160m-deduped": ModelType.PT,
169
+ "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
170
+ "EleutherAI/pythia-12b": ModelType.PT,
171
+ "roneneldan/TinyStories-33M": ModelType.PT,
172
+ "roneneldan/TinyStories-28M": ModelType.PT,
173
+ "roneneldan/TinyStories-1M": ModelType.PT,
174
+ "roneneldan/TinyStories-8M": ModelType.PT,
175
+ "roneneldan/TinyStories-3M": ModelType.PT,
176
+ "jerryjalapeno/nart-100k-7b": ModelType.FT,
177
+ "lmsys/vicuna-13b-v1.3": ModelType.IFT,
178
+ "lmsys/vicuna-7b-v1.3": ModelType.IFT,
179
+ "lmsys/vicuna-13b-v1.1": ModelType.IFT,
180
+ "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
181
+ "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
182
+ "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
183
+ "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
184
+ "Gryphe/MythoLogic-13b": ModelType.IFT,
185
+ "Gryphe/MythoBoros-13b": ModelType.IFT,
186
+ "pillowtalks-ai/delta13b": ModelType.FT,
187
+ "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
188
+ "bigscience/bloom-7b1": ModelType.PT,
189
+ "bigcode/tiny_starcoder_py": ModelType.PT,
190
+ "bigcode/starcoderplus": ModelType.FT,
191
+ "bigcode/gpt_bigcode-santacoder": ModelType.PT,
192
+ "bigcode/starcoder": ModelType.PT,
193
+ "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
194
+ "microsoft/DialoGPT-large": ModelType.FT,
195
+ "microsoft/DialoGPT-small": ModelType.FT,
196
+ "microsoft/DialoGPT-medium": ModelType.FT,
197
+ "microsoft/CodeGPT-small-py": ModelType.FT,
198
+ "Tincando/fiction_story_generator": ModelType.FT,
199
+ "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
200
+ "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
201
+ "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
202
+ "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
203
+ "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
204
+ "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
205
+ "illuin/test-custom-llama": ModelType.FT,
206
+ "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
207
+ "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
208
+ "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
209
+ "dvruette/llama-13b-pretrained": ModelType.PT,
210
+ "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
211
+ "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
212
+ "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
213
+ "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
214
+ "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
215
+ "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
216
+ "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
217
+ "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
218
+ "openlm-research/open_llama_7b": ModelType.PT,
219
+ "openlm-research/open_llama_7b_v2": ModelType.PT,
220
+ "openlm-research/open_llama_3b": ModelType.PT,
221
+ "openlm-research/open_llama_13b": ModelType.PT,
222
+ "openlm-research/open_llama_3b_v2": ModelType.PT,
223
+ "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
224
+ "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
225
+ "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
226
+ "databricks/dolly-v2-7b": ModelType.IFT,
227
+ "databricks/dolly-v2-3b": ModelType.IFT,
228
+ "databricks/dolly-v2-12b": ModelType.IFT,
229
+ "Rachneet/gpt2-xl-alpaca": ModelType.FT,
230
+ "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
231
+ "psyche/kogpt": ModelType.FT,
232
+ "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
233
+ "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
234
+ "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
235
+ "Fredithefish/CrimsonPajama": ModelType.IFT,
236
+ "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
237
+ "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
238
+ "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
239
+ "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
240
+ "eachadea/vicuna-13b-1.1": ModelType.FT,
241
+ "eachadea/vicuna-7b-1.1": ModelType.FT,
242
+ "eachadea/vicuna-13b": ModelType.FT,
243
+ "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
244
+ "openaccess-ai-collective/manticore-13b": ModelType.IFT,
245
+ "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
246
+ "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
247
+ "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
248
+ "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
249
+ "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
250
+ "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
251
+ "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
252
+ "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
253
+ "stabilityai/StableBeluga1-Delta": ModelType.IFT,
254
+ "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
255
+ "stabilityai/StableBeluga2": ModelType.IFT,
256
+ "stabilityai/StableBeluga-13B": ModelType.IFT,
257
+ "stabilityai/StableBeluga-7B": ModelType.IFT,
258
+ "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
259
+ "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
260
+ "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
261
+ "alibidaran/medical_transcription_generator": ModelType.FT,
262
+ "CalderaAI/30B-Lazarus": ModelType.IFT,
263
+ "CalderaAI/13B-BlueMethod": ModelType.IFT,
264
+ "CalderaAI/13B-Ouroboros": ModelType.IFT,
265
+ "KoboldAI/OPT-13B-Erebus": ModelType.FT,
266
+ "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
267
+ "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
268
+ "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
269
+ "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
270
+ "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
271
+ "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
272
+ "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
273
+ "KoboldAI/fairseq-dense-125M": ModelType.PT,
274
+ "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
275
+ "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
276
+ "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
277
+ "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
278
+ "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
279
+ "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
280
+ "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
281
+ "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
282
+ "KoboldAI/fairseq-dense-355M": ModelType.PT,
283
+ "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
284
+ "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
285
+ "KoboldAI/OPT-350M-Erebus": ModelType.FT,
286
+ "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
287
+ "KoboldAI/OPT-30B-Erebus": ModelType.FT,
288
+ "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
289
+ "klosax/open_llama_3b_350bt_preview": ModelType.PT,
290
+ "klosax/openllama-3b-350bt": ModelType.PT,
291
+ "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
292
+ "klosax/open_llama_13b_600bt_preview": ModelType.PT,
293
+ "klosax/open_llama_7b_400bt_preview": ModelType.PT,
294
+ "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
295
+ "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
296
+ "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
297
+ "TFLai/gpt2-turkish-uncased": ModelType.FT,
298
+ "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
299
+ "ehartford/dolphin-llama-13b": ModelType.IFT,
300
+ "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
301
+ "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
302
+ "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
303
+ "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
304
+ "ehartford/based-30b": ModelType.FT,
305
+ "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
306
+ "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
307
+ "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
308
+ "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
309
+ "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
310
+ "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
311
+ "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
312
+ "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
313
+ "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
314
+ "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
315
+ "junelee/wizard-vicuna-13b": ModelType.FT,
316
+ "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
317
+ "BreadAi/MuseCan": ModelType.PT,
318
+ "BreadAi/MusePy-1-2": ModelType.PT,
319
+ "BreadAi/DiscordPy": ModelType.PT,
320
+ "BreadAi/PM_modelV2": ModelType.PT,
321
+ "BreadAi/gpt-Youtube": ModelType.PT,
322
+ "BreadAi/StoryPy": ModelType.FT,
323
+ "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
324
+ "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
325
+ "AGI-inc/lora_moe_7b": ModelType.FT,
326
+ "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
327
+ "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
328
+ "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
329
+ "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
330
+ "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
331
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
332
+ "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
333
+ "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
334
+ "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
335
+ "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
336
+ "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
337
+ "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
338
+ "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
339
+ "Writer/camel-5b-hf": ModelType.IFT,
340
+ "Writer/palmyra-base": ModelType.PT,
341
+ "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
342
+ "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
343
+ "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
344
+ "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
345
+ "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
346
+ "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
347
+ "MBZUAI/lamini-neo-125m": ModelType.IFT,
348
+ "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
349
+ "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
350
+ "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
351
+ "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
352
+ "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
353
+ "TehVenom/Dolly_Malion-6b": ModelType.FT,
354
+ "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
355
+ "TehVenom/ChanMalion": ModelType.FT,
356
+ "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
357
+ "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
358
+ "TehVenom/Metharme-13b-Merged": ModelType.IFT,
359
+ "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
360
+ "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
361
+ "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
362
+ "vicgalle/gpt2-alpaca": ModelType.IFT,
363
+ "vicgalle/alpaca-7b": ModelType.FT,
364
+ "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
365
+ "facebook/opt-350m": ModelType.PT,
366
+ "facebook/opt-125m": ModelType.PT,
367
+ "facebook/xglm-4.5B": ModelType.PT,
368
+ "facebook/opt-2.7b": ModelType.PT,
369
+ "facebook/opt-6.7b": ModelType.PT,
370
+ "facebook/galactica-30b": ModelType.PT,
371
+ "facebook/opt-13b": ModelType.PT,
372
+ "facebook/opt-66b": ModelType.PT,
373
+ "facebook/xglm-7.5B": ModelType.PT,
374
+ "facebook/xglm-564M": ModelType.PT,
375
+ "facebook/opt-30b": ModelType.PT,
376
+ "golaxy/gogpt-7b": ModelType.FT,
377
+ "golaxy/gogpt2-7b": ModelType.FT,
378
+ "golaxy/gogpt-7b-bloom": ModelType.FT,
379
+ "golaxy/gogpt-3b-bloom": ModelType.FT,
380
+ "psmathur/orca_mini_v2_7b": ModelType.IFT,
381
+ "psmathur/orca_mini_7b": ModelType.IFT,
382
+ "psmathur/orca_mini_3b": ModelType.IFT,
383
+ "psmathur/orca_mini_v2_13b": ModelType.IFT,
384
+ "gpt2-xl": ModelType.PT,
385
+ "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
386
+ "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
387
+ "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
388
+ "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
389
+ "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
390
+ "jzjiao/opt-1.3b-rlhf": ModelType.FT,
391
+ "HuggingFaceH4/starchat-beta": ModelType.IFT,
392
+ "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
393
+ "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
394
+ "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
395
+ "openchat/openchat_8192": ModelType.IFT,
396
+ "openchat/openchat_v2": ModelType.IFT,
397
+ "openchat/openchat_v2_w": ModelType.IFT,
398
+ "ausboss/llama-13b-supercot": ModelType.IFT,
399
+ "ausboss/llama-30b-supercot": ModelType.IFT,
400
+ "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
401
+ "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
402
+ "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
403
+ "victor123/WizardLM-13B-1.0": ModelType.IFT,
404
+ "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
405
+ "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
406
+ "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
407
+ "baichuan-inc/Baichuan-7B": ModelType.PT,
408
+ "tiiuae/falcon-40b-instruct": ModelType.IFT,
409
+ "tiiuae/falcon-40b": ModelType.PT,
410
+ "tiiuae/falcon-7b": ModelType.PT,
411
+ "YeungNLP/firefly-llama-13b": ModelType.FT,
412
+ "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
413
+ "YeungNLP/firefly-llama2-13b": ModelType.FT,
414
+ "YeungNLP/firefly-ziya-13b": ModelType.FT,
415
+ "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
416
+ "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
417
+ "xzuyn/MedicWizard-7B": ModelType.FT,
418
+ "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
419
+ "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
420
+ "beomi/llama-2-ko-7b": ModelType.IFT,
421
+ "Salesforce/codegen-6B-multi": ModelType.PT,
422
+ "Salesforce/codegen-16B-nl": ModelType.PT,
423
+ "Salesforce/codegen-6B-nl": ModelType.PT,
424
+ "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
425
+ "gpt2-large": ModelType.PT,
426
+ "frank098/orca_mini_3b_juniper": ModelType.FT,
427
+ "frank098/WizardLM_13B_juniper": ModelType.FT,
428
+ "FPHam/Free_Sydney_13b_HF": ModelType.FT,
429
+ "huggingface/llama-13b": ModelType.PT,
430
+ "huggingface/llama-7b": ModelType.PT,
431
+ "huggingface/llama-65b": ModelType.PT,
432
+ "huggingface/llama-30b": ModelType.PT,
433
+ "Henk717/chronoboros-33B": ModelType.IFT,
434
+ "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
435
+ "jondurbin/airoboros-7b": ModelType.IFT,
436
+ "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
437
+ "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
438
+ "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
439
+ "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
440
+ "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
441
+ "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
442
+ "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
443
+ "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
444
+ "jondurbin/airoboros-13b": ModelType.IFT,
445
+ "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
446
+ "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
447
+ "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
448
+ "ariellee/SuperPlatty-30B": ModelType.IFT,
449
+ "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
450
+ "cerebras/Cerebras-GPT-256M": ModelType.PT,
451
+ "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
452
+ "cerebras/Cerebras-GPT-13B": ModelType.PT,
453
+ "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
454
+ "cerebras/Cerebras-GPT-111M": ModelType.PT,
455
+ "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
456
+ "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
457
+ "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
458
+ "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
459
+ "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
460
+ "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
461
+ "NousResearch/Nous-Hermes-13b": ModelType.IFT,
462
+ "project-baize/baize-v2-7b": ModelType.IFT,
463
+ "project-baize/baize-v2-13b": ModelType.IFT,
464
+ "LLMs/WizardLM-13B-V1.0": ModelType.FT,
465
+ "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
466
+ "wenge-research/yayi-7b": ModelType.FT,
467
+ "wenge-research/yayi-7b-llama2": ModelType.FT,
468
+ "wenge-research/yayi-13b-llama2": ModelType.FT,
469
+ "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
470
+ "llama-anon/instruct-13b": ModelType.IFT,
471
+ "huggingtweets/jerma985": ModelType.FT,
472
+ "huggingtweets/gladosystem": ModelType.FT,
473
+ "huggingtweets/bladeecity-jerma985": ModelType.FT,
474
+ "huggyllama/llama-13b": ModelType.PT,
475
+ "huggyllama/llama-65b": ModelType.PT,
476
+ "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
477
+ "upstage/Llama-2-70b-instruct": ModelType.IFT,
478
+ "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
479
+ "upstage/llama-65b-instruct": ModelType.IFT,
480
+ "upstage/llama-30b-instruct-2048": ModelType.IFT,
481
+ "upstage/llama-30b-instruct": ModelType.IFT,
482
+ "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
483
+ "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
484
+ "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
485
+ "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
486
+ "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
487
+ "gpt2": ModelType.PT,
488
+ "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
489
+ "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
490
+ "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
491
+ "quantumaikr/KoreanLM-hf": ModelType.FT,
492
+ "quantumaikr/open_llama_7b_hf": ModelType.FT,
493
+ "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
494
+ "MayaPH/FinOPT-Lincoln": ModelType.FT,
495
+ "MayaPH/FinOPT-Franklin": ModelType.FT,
496
+ "MayaPH/GodziLLa-30B": ModelType.IFT,
497
+ "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
498
+ "MayaPH/FinOPT-Washington": ModelType.FT,
499
+ "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
500
+ "layoric/llama-2-13b-code-alpaca": ModelType.FT,
501
+ "CobraMamba/mamba-gpt-3b": ModelType.FT,
502
+ "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
503
+ "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
504
+ "timdettmers/guanaco-33b-merged": ModelType.FT,
505
+ "elinas/chronos-33b": ModelType.IFT,
506
+ "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
507
+ "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
508
+ "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
509
+ "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
510
+ "meta-llama/Llama-2-7b-hf": ModelType.PT,
511
+ "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
512
+ "meta-llama/Llama-2-13b-hf": ModelType.PT,
513
+ "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
514
+ "meta-llama/Llama-2-70b-hf": ModelType.PT,
515
+ "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
516
+ "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
517
+ "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
518
+ "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
519
+ "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
520
+ "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
521
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
522
+ "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
523
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
524
+ "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
525
+ "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
526
+ "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
527
+ "bofenghuang/vigogne-13b-chat": ModelType.FT,
528
+ "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
529
+ "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
530
+ "bofenghuang/vigogne-7b-chat": ModelType.FT,
531
+ "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
532
+ "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
533
+ "ewof/koishi-instruct-3b": ModelType.IFT,
534
+ "gywy/llama2-13b-chinese-v1": ModelType.FT,
535
+ "GOAT-AI/GOAT-7B-Community": ModelType.FT,
536
+ "psyche/kollama2-7b": ModelType.FT,
537
+ "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
538
+ "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
539
+ "augtoma/qCammel-70-x": ModelType.IFT,
540
+ "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
541
+ "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
542
+ "64bits/LexPodLM-13B": ModelType.FT,
543
+ }
+ 
+ 
+ def model_type_from_str(type):
+     if "fine-tuned" in type or "🔶" in type:
+         return ModelType.FT
+     if "pretrained" in type or "🟢" in type:
+         return ModelType.PT
+     if "RL-tuned" in type or "🟦" in type:
+         return ModelType.RL
+     if "instruction-tuned" in type or "⭕" in type:
+         return ModelType.IFT
+     return ModelType.Unknown
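For reference, `model_type_from_str` maps the free-form type strings (or their emoji shorthand) stored in request files onto the `ModelType` enum defined earlier in this file. A minimal usage sketch, assuming that enum is in scope:

    model_type_from_str("🔶 fine-tuned")          # -> ModelType.FT
    model_type_from_str("pretrained")             # -> ModelType.PT
    model_type_from_str("RL-tuned 🟦")            # -> ModelType.RL
    model_type_from_str("instruction-tuned ⭕")   # -> ModelType.IFT
    model_type_from_str("adapter")                # -> ModelType.Unknown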
src/display_models/modelcard_filter.py ADDED
@@ -0,0 +1,26 @@
+ import huggingface_hub
+ from huggingface_hub import ModelCard
+ 
+ 
+ # ht to @Wauplin, thank you for the snippet!
+ # See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
+     # Returns operation status, and error message
+     try:
+         card = ModelCard.load(repo_id)
+     except huggingface_hub.utils.EntryNotFoundError:
+         return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
+ 
+     # Enforce license metadata
+     if card.data.license is None:
+         if not ("license_name" in card.data and "license_link" in card.data):
+             return False, (
+                 "License not found. Please add a license to your model card using the `license` metadata or a"
+                 " `license_name`/`license_link` pair."
+             )
+ 
+     # Enforce card content
+     if len(card.text) < 200:
+         return False, "Please add a description to your model card, it is too short."
+ 
+     return True, ""
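For reference, a minimal usage sketch of `check_model_card`; the repo id below is only illustrative, and the call needs network access since `ModelCard.load` fetches the card from the Hub:

    ok, message = check_model_card("some-org/some-model")  # hypothetical repo id
    if not ok:
        print(f"Submission refused: {message}")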
src/display_models/read_results.py ADDED
@@ -0,0 +1,153 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import Dict, List, Tuple
+ 
+ import dateutil
+ import numpy as np
+ 
+ from src.display_models.utils import AutoEvalColumn, make_clickable_model
+ 
+ METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
+ BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
+ BENCH_TO_NAME = {
+     "arc:challenge": AutoEvalColumn.arc.name,
+     "hellaswag": AutoEvalColumn.hellaswag.name,
+     "hendrycksTest": AutoEvalColumn.mmlu.name,
+     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
+ }
+ 
+ 
+ @dataclass
+ class EvalResult:
+     eval_name: str
+     org: str
+     model: str
+     revision: str
+     results: dict
+     precision: str = ""
+     model_type: str = ""
+     weight_type: str = "Original"
+     date: str = ""
+ 
+     def to_dict(self):
+         from src.load_from_hub import is_model_on_hub
+ 
+         if self.org is not None:
+             base_model = f"{self.org}/{self.model}"
+         else:
+             base_model = f"{self.model}"
+         data_dict = {}
+ 
+         data_dict["eval_name"] = self.eval_name  # not a column, just a save name
+         data_dict["weight_type"] = self.weight_type  # not a column, just a save name
+         data_dict[AutoEvalColumn.precision.name] = self.precision
+         data_dict[AutoEvalColumn.model_type.name] = self.model_type
+         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
+         data_dict[AutoEvalColumn.dummy.name] = base_model
+         data_dict[AutoEvalColumn.revision.name] = self.revision
+         data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
+         data_dict[AutoEvalColumn.still_on_hub.name] = (
+             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
+         )
+ 
+         for benchmark in BENCHMARKS:
+             if benchmark not in self.results.keys():
+                 self.results[benchmark] = None
+ 
+         for k, v in BENCH_TO_NAME.items():
+             data_dict[v] = self.results[k]
+ 
+         return data_dict
+ 
+ 
+ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
+     with open(json_filepath) as fp:
+         data = json.load(fp)
+ 
+     for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
+         if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
+             return None, []  # we skip models with the wrong version
+ 
+     try:
+         config = data["config"]
+     except KeyError:
+         config = data["config_general"]
+     model = config.get("model_name", None)
+     if model is None:
+         model = config.get("model_args", None)
+ 
+     model_sha = config.get("model_sha", "")
+     model_split = model.split("/", 1)
+ 
+     precision = config.get("model_dtype")
+ 
+     model = model_split[-1]
+ 
+     if len(model_split) == 1:
+         org = None
+         model = model_split[0]
+         result_key = f"{model}_{precision}"
+     else:
+         org = model_split[0]
+         model = model_split[1]
+         result_key = f"{org}_{model}_{precision}"
+ 
+     eval_results = []
+     for benchmark, metric in zip(BENCHMARKS, METRICS):
+         accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+         if accs.size == 0 or any([acc is None for acc in accs]):
+             continue
+         mean_acc = np.mean(accs) * 100.0
+         eval_results.append(
+             EvalResult(
+                 eval_name=result_key,
+                 org=org,
+                 model=model,
+                 revision=model_sha,
+                 results={benchmark: mean_acc},
+                 precision=precision,  # todo model_type=, weight_type=
+                 date=config.get("submission_date"),
+             )
+         )
+ 
+     return result_key, eval_results
+ 
+ 
+ def get_eval_results() -> List[EvalResult]:
+     json_filepaths = []
+ 
+     for root, dir, files in os.walk("eval-results"):
+         # We should only have json files in model results
+         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+             continue
+ 
+         # Sort the files by date
+         # store results by precision maybe?
+         try:
+             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+         except dateutil.parser._parser.ParserError:
+             files = [files[-1]]
+ 
+         # up_to_date = files[-1]
+         for file in files:
+             json_filepaths.append(os.path.join(root, file))
+ 
+     eval_results = {}
+     for json_filepath in json_filepaths:
+         result_key, results = parse_eval_result(json_filepath)
+         for eval_result in results:
+             if result_key in eval_results.keys():
+                 eval_results[result_key].results.update(eval_result.results)
+             else:
+                 eval_results[result_key] = eval_result
+ 
+     eval_results = [v for v in eval_results.values()]
+ 
+     return eval_results
+ 
+ 
+ def get_eval_results_dicts() -> List[Dict]:
+     eval_results = get_eval_results()
+ 
+     return [e.to_dict() for e in eval_results]
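For reference, the file layout `parse_eval_result` expects can be read off the code above; a hypothetical, minimal result file (only keys the parser actually accesses, numbers invented):

    example = {
        "config": {"model_name": "some-org/some-model", "model_sha": "abc123", "model_dtype": "float16"},
        "versions": {"harness|hendrycksTest-abstract_algebra|5": 1},
        "results": {
            "harness|arc:challenge|25": {"acc_norm": 0.52},
            "harness|hellaswag|10": {"acc_norm": 0.78},
        },
    }
    # Written to disk and passed to parse_eval_result, this would yield the key
    # "some-org_some-model_float16" and one EvalResult per benchmark found above.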
src/display_models/utils.py ADDED
@@ -0,0 +1,146 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ API = HfApi()
7
+
8
+
9
+ # These classes are for user facing column names, to avoid having to change them
10
+ # all around the code when a modif is needed
11
+ @dataclass
12
+ class ColumnContent:
13
+ name: str
14
+ type: str
15
+ displayed_by_default: bool
16
+ hidden: bool = False
17
+
18
+
19
+ def fields(raw_class):
20
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
+
22
+
23
+ @dataclass(frozen=True)
24
+ class AutoEvalColumn: # Auto evals column
25
+ model_type_symbol = ColumnContent("T", "str", True)
26
+ model = ColumnContent("Model", "markdown", True)
27
+ average = ColumnContent("Average ⬆️", "number", True)
28
+ arc = ColumnContent("ARC", "number", True)
29
+ hellaswag = ColumnContent("HellaSwag", "number", True)
30
+ mmlu = ColumnContent("MMLU", "number", True)
31
+ truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
+ model_type = ColumnContent("Type", "str", False)
33
+ precision = ColumnContent("Precision", "str", False) # , True)
34
+ license = ColumnContent("Hub License", "str", False)
35
+ params = ColumnContent("#Params (B)", "number", False)
36
+ likes = ColumnContent("Hub ❤️", "number", False)
37
+ still_on_hub = ColumnContent("Available on the hub", "bool", False)
38
+ revision = ColumnContent("Model sha", "str", False, False)
39
+ dummy = ColumnContent(
40
+ "model_name_for_query", "str", True
41
+ ) # dummy col to implement search bar (hidden by custom CSS)
42
+
43
+
44
+ @dataclass(frozen=True)
45
+ class EloEvalColumn: # Elo evals column
46
+ model = ColumnContent("Model", "markdown", True)
47
+ gpt4 = ColumnContent("GPT-4 (all)", "number", True)
48
+ human_all = ColumnContent("Human (all)", "number", True)
49
+ human_instruct = ColumnContent("Human (instruct)", "number", True)
50
+ human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
51
+
52
+
53
+ @dataclass(frozen=True)
54
+ class EvalQueueColumn: # Queue column
55
+ model = ColumnContent("model", "markdown", True)
56
+ revision = ColumnContent("revision", "str", True)
57
+ private = ColumnContent("private", "bool", True)
58
+ precision = ColumnContent("precision", "str", True)
59
+ weight_type = ColumnContent("weight_type", "str", "Original")
60
+ status = ColumnContent("status", "str", True)
61
+
62
+
63
+ LLAMAS = [
64
+ "huggingface/llama-7b",
65
+ "huggingface/llama-13b",
66
+ "huggingface/llama-30b",
67
+ "huggingface/llama-65b",
68
+ ]
69
+
70
+
71
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
72
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
73
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
74
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
75
+ MODEL_PAGE = "https://huggingface.co/models"
76
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
77
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
78
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
79
+
80
+
81
+ def model_hyperlink(link, model_name):
82
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
83
+
84
+
85
+ def make_clickable_model(model_name):
86
+ link = f"https://huggingface.co/{model_name}"
87
+
88
+ if model_name in LLAMAS:
89
+ link = LLAMA_LINK
90
+ model_name = model_name.split("/")[1]
91
+ elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
92
+ link = VICUNA_LINK
93
+ model_name = "stable-vicuna-13b"
94
+ elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
95
+ link = ALPACA_LINK
96
+ model_name = "alpaca-13b"
97
+ if model_name == "dolly-12b":
98
+ link = DOLLY_LINK
99
+ elif model_name == "vicuna-13b":
100
+ link = VICUNA_LINK
101
+ elif model_name == "koala-13b":
102
+ link = KOALA_LINK
103
+ elif model_name == "oasst-12b":
104
+ link = OASST_LINK
105
+
106
+ details_model_name = model_name.replace("/", "__")
107
+ details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
108
+
109
+ if not bool(os.getenv("DEBUG", "False")):
110
+ # We only add these checks when not debugging, as they are extremely slow
111
+ print(f"details_link: {details_link}")
112
+ try:
113
+ check_path = list(
114
+ API.list_files_info(
115
+ repo_id=f"open-llm-leaderboard/details_{details_model_name}",
116
+ paths="README.md",
117
+ repo_type="dataset",
118
+ )
119
+ )
120
+ print(f"check_path: {check_path}")
121
+ except Exception as err:
122
+ # No details repo for this model
123
+ print(f"No details repo for this model: {err}")
124
+ return model_hyperlink(link, model_name)
125
+
126
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
127
+
128
+
129
+ def styled_error(error):
130
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
131
+
132
+
133
+ def styled_warning(warn):
134
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
135
+
136
+
137
+ def styled_message(message):
138
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
139
+
140
+
141
+ def has_no_nan_values(df, columns):
142
+ return df[columns].notna().all(axis=1)
143
+
144
+
145
+ def has_nan_values(df, columns):
146
+ return df[columns].isna().any(axis=1)
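For reference, a short sketch of how the leaderboard cell for a model is built with the helpers above (repo id illustrative):

    cell = make_clickable_model("some-org/some-model")  # hypothetical repo id
    # `cell` is an HTML link to the model page, with an extra 📑 link to the
    # open-llm-leaderboard/details_<org>__<model> dataset when that repo exists.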
src/envs.py DELETED
@@ -1,32 +0,0 @@
- import os
- from huggingface_hub import HfApi
- 
- # clone / pull the lmeh eval data
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
- 
- REPO_ID = "open-llm-leaderboard/open_llm_leaderboard"
- QUEUE_REPO = "open-llm-leaderboard/requests"
- AGGREGATED_REPO = "open-llm-leaderboard/contents"
- VOTES_REPO = "open-llm-leaderboard/votes"
- 
- HF_HOME = os.getenv("HF_HOME", ".")
- 
- # Check HF_HOME write access
- print(f"Initial HF_HOME set to: {HF_HOME}")
- 
- if not os.access(HF_HOME, os.W_OK):
-     print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
-     HF_HOME = "."
-     os.environ["HF_HOME"] = HF_HOME
- else:
-     print("Write access confirmed for HF_HOME")
- 
- VOTES_PATH = os.path.join(HF_HOME, "model-votes")
- EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
- 
- # Rate limit variables
- RATE_LIMIT_PERIOD = 7
- RATE_LIMIT_QUOTA = 5
- HAS_HIGHER_RATE_LIMIT = []
- 
- API = HfApi(token=HF_TOKEN)
src/leaderboard/filter_models.py DELETED
@@ -1,75 +0,0 @@
- from src.display.formatting import model_hyperlink
- from src.display.utils import AutoEvalColumn
- 
- 
- # Models which have been flagged by users as being problematic for a reason or another
- # (Model name to forum discussion link)
- # None for the v2 so far!
- FLAGGED_MODELS = {}
- 
- # Models which have been requested by orgs to not be submitted on the leaderboard
- DO_NOT_SUBMIT_MODELS = [
-     "Voicelab/trurl-2-13b",  # trained on MMLU
-     "TigerResearch/tigerbot-70b-chat",  # per authors request
-     "TigerResearch/tigerbot-70b-chat-v2",  # per authors request
-     "TigerResearch/tigerbot-70b-chat-v4-4k",  # per authors request
- ]
- 
- 
- def flag_models(leaderboard_data: list[dict]):
-     """Flags models based on external criteria or flagged status."""
-     for model_data in leaderboard_data:
-         # Skip flagging if maintainers_highlight is True
-         if model_data.get(AutoEvalColumn.maintainers_highlight.name, False):
-             model_data[AutoEvalColumn.not_flagged.name] = True
-             continue
- 
-         # If a model is not flagged, use its "fullname" as a key
-         if model_data[AutoEvalColumn.not_flagged.name]:
-             flag_key = model_data[AutoEvalColumn.fullname.name]
-         else:
-             flag_key = None
- 
-         # Reverse the logic: Check for non-flagged models instead
-         if flag_key in FLAGGED_MODELS:
-             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
-             issue_link = model_hyperlink(
-                 FLAGGED_MODELS[flag_key],
-                 f"See discussion #{issue_num}",
-             )
-             model_data[AutoEvalColumn.model.name] = (
-                 f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
-             )
-             model_data[AutoEvalColumn.not_flagged.name] = False
-         else:
-             model_data[AutoEvalColumn.not_flagged.name] = True
- 
- 
- def remove_forbidden_models(leaderboard_data: list[dict]):
-     """Removes models from the leaderboard based on the DO_NOT_SUBMIT list."""
-     indices_to_remove = []
-     for ix, model in enumerate(leaderboard_data):
-         if model[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
-             indices_to_remove.append(ix)
- 
-     # Remove the models from the list
-     for ix in reversed(indices_to_remove):
-         leaderboard_data.pop(ix)
-     return leaderboard_data
- 
- """
- def remove_forbidden_models(leaderboard_data):
-     #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
-     indices_to_remove = []
-     for ix, row in leaderboard_data.iterrows():
-         if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
-             indices_to_remove.append(ix)
- 
-     # Remove the models from the list
-     return leaderboard_data.drop(indices_to_remove)
- """
- 
- 
- def filter_models_flags(leaderboard_data: list[dict]):
-     leaderboard_data = remove_forbidden_models(leaderboard_data)
-     flag_models(leaderboard_data)
src/load_from_hub.py ADDED
@@ -0,0 +1,152 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+ from huggingface_hub import Repository
6
+ from transformers import AutoConfig
7
+ from collections import defaultdict
8
+
9
+ from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
10
+ from src.display_models.get_model_metadata import apply_metadata
11
+ from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
12
+ from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
13
+
14
+ IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
15
+
16
+
17
+ def get_all_requested_models(requested_models_dir: str) -> set[str]:
18
+ depth = 1
19
+ file_names = []
20
+ users_to_submission_dates = defaultdict(list)
21
+
22
+ for root, _, files in os.walk(requested_models_dir):
23
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
24
+ if current_depth == depth:
25
+ for file in files:
26
+ if not file.endswith(".json"):
27
+ continue
28
+ with open(os.path.join(root, file), "r") as f:
29
+ info = json.load(f)
30
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
31
+
32
+ # Select organisation
33
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
34
+ continue
35
+ organisation, _ = info["model"].split("/")
36
+ users_to_submission_dates[organisation].append(info["submitted_time"])
37
+
38
+ return set(file_names), users_to_submission_dates
39
+
40
+
41
+ def load_all_info_from_hub(QUEUE_REPO: str, RESULTS_REPO: str, QUEUE_PATH: str, RESULTS_PATH: str) -> list[Repository]:
42
+ eval_queue_repo = None
43
+ eval_results_repo = None
44
+ requested_models = None
45
+
46
+ print("Pulling evaluation requests and results.")
47
+
48
+ eval_queue_repo = Repository(
49
+ local_dir=QUEUE_PATH,
50
+ clone_from=QUEUE_REPO,
51
+ repo_type="dataset",
52
+ )
53
+ eval_queue_repo.git_pull()
54
+
55
+ eval_results_repo = Repository(
56
+ local_dir=RESULTS_PATH,
57
+ clone_from=RESULTS_REPO,
58
+ repo_type="dataset",
59
+ )
60
+ eval_results_repo.git_pull()
61
+
62
+ requested_models, users_to_submission_dates = get_all_requested_models("eval-queue")
63
+
64
+ return eval_queue_repo, requested_models, eval_results_repo, users_to_submission_dates
65
+
66
+
67
+ def get_leaderboard_df(
68
+ eval_results: Repository, eval_results_private: Repository, cols: list, benchmark_cols: list
69
+ ) -> pd.DataFrame:
70
+ if eval_results:
71
+ print("Pulling evaluation results for the leaderboard.")
72
+ eval_results.git_pull()
73
+ if eval_results_private:
74
+ print("Pulling evaluation results for the leaderboard.")
75
+ eval_results_private.git_pull()
76
+
77
+ all_data = get_eval_results_dicts()
78
+
79
+ if not IS_PUBLIC:
80
+ all_data.append(gpt4_values)
81
+ all_data.append(gpt35_values)
82
+
83
+ all_data.append(baseline)
84
+ apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
85
+
86
+ df = pd.DataFrame.from_records(all_data)
87
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
88
+ df = df[cols].round(decimals=2)
89
+
90
+ # filter out if any of the benchmarks have not been produced
91
+ df = df[has_no_nan_values(df, benchmark_cols)]
92
+ return df
93
+
94
+
95
+ def get_evaluation_queue_df(
96
+ eval_queue: Repository, eval_queue_private: Repository, save_path: str, cols: list
97
+ ) -> list[pd.DataFrame]:
98
+ if eval_queue:
99
+ print("Pulling changes for the evaluation queue.")
100
+ eval_queue.git_pull()
101
+ if eval_queue_private:
102
+ print("Pulling changes for the evaluation queue.")
103
+ eval_queue_private.git_pull()
104
+
105
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
106
+ all_evals = []
107
+
108
+ for entry in entries:
109
+ if ".json" in entry:
110
+ file_path = os.path.join(save_path, entry)
111
+ with open(file_path) as fp:
112
+ data = json.load(fp)
113
+
114
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
115
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
116
+
117
+ all_evals.append(data)
118
+ elif ".md" not in entry:
119
+ # this is a folder
120
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
121
+ for sub_entry in sub_entries:
122
+ file_path = os.path.join(save_path, entry, sub_entry)
123
+ with open(file_path) as fp:
124
+ data = json.load(fp)
125
+
126
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
127
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
128
+ all_evals.append(data)
129
+
130
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
131
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
132
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
133
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
134
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
135
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
136
+ return df_finished[cols], df_running[cols], df_pending[cols]
137
+
138
+
139
+ def is_model_on_hub(model_name: str, revision: str) -> bool:
140
+ try:
141
+ AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
142
+ return True, None
143
+
144
+ except ValueError:
145
+ return (
146
+ False,
147
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
148
+ )
149
+
150
+ except Exception as e:
151
+ print(f"Could not get the model config from the hub.: {e}")
152
+ return False, "was not found on hub!"
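For reference, a sketch of how the submission flow would use `is_model_on_hub` before queueing a model (repo id illustrative; the call downloads the model config, so it needs network access):

    on_hub, error = is_model_on_hub("some-org/some-model", revision="main")  # hypothetical repo id
    if not on_hub:
        print(f"Model rejected: it {error}")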
src/populate.py DELETED
@@ -1,54 +0,0 @@
- import pathlib
- import pandas as pd
- from datasets import Dataset
- from src.display.formatting import has_no_nan_values, make_clickable_model
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
- from src.leaderboard.filter_models import filter_models_flags
- from src.display.utils import load_json_data
- 
- 
- def _process_model_data(entry, model_name_key="model", revision_key="revision"):
-     """Enrich model data with clickable links and revisions."""
-     entry[EvalQueueColumn.model_name.name] = entry.get(model_name_key, "")
-     entry[EvalQueueColumn.model_link.name] = make_clickable_model(entry.get(model_name_key, ""))
-     entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
-     return entry
- 
- 
- def get_evaluation_queue_df(save_path, cols):
-     """Generate dataframes for pending, running, and finished evaluation entries."""
-     save_path = pathlib.Path(save_path)
-     all_evals = []
- 
-     for path in save_path.rglob("*.json"):
-         data = load_json_data(path)
-         if data:
-             all_evals.append(_process_model_data(data))
- 
-     # Organizing data by status
-     status_map = {
-         "PENDING": ["PENDING", "RERUN"],
-         "RUNNING": ["RUNNING"],
-         "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
-     }
-     status_dfs = {status: [] for status in status_map}
-     for eval_data in all_evals:
-         for status, extra_statuses in status_map.items():
-             if eval_data["status"] in extra_statuses:
-                 status_dfs[status].append(eval_data)
- 
-     return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
- 
- 
- def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
-     """Retrieve and process leaderboard data."""
-     all_data_json = leaderboard_dataset.to_dict()
-     num_items = leaderboard_dataset.num_rows
-     all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
-     filter_models_flags(all_data_json_list)
- 
-     df = pd.DataFrame.from_records(all_data_json_list)
-     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-     df = df[cols].round(decimals=2)
-     df = df[has_no_nan_values(df, benchmark_cols)]
-     return df
src/rate_limiting.py ADDED
@@ -0,0 +1,13 @@
+ from datetime import datetime, timezone, timedelta
+ 
+ 
+ def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
+     org_or_user, _ = submission_name.split("/")
+     if org_or_user not in users_to_submission_dates:
+         return 0
+     submission_dates = sorted(users_to_submission_dates[org_or_user])
+ 
+     time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
+     submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
+ 
+     return len(submissions_after_timelimit)
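For reference, a short sketch of the rate-limit helper above; the dict mirrors what `get_all_requested_models` builds (organisation name and timestamps are made up), and the ISO-8601 strings compare correctly as plain strings:

    dates = {"some-org": ["2023-09-01T10:00:00Z", "2023-09-10T12:30:00Z"]}
    recent = user_submission_permission("some-org/some-model", dates, rate_limit_period=7)
    print(f"{recent} submission(s) in the last 7 days")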
src/submission/check_validity.py DELETED
@@ -1,183 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
10
- from transformers import AutoConfig, AutoTokenizer
11
-
12
- from src.display.utils import parse_iso8601_datetime
13
- from src.envs import HAS_HIGHER_RATE_LIMIT
14
-
15
-
16
- # ht to @Wauplin, thank you for the snippet!
17
- # See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
18
- def check_model_card(repo_id: str) -> tuple[bool, str]:
19
- # Returns operation status, and error message
20
- try:
21
- card = ModelCard.load(repo_id)
22
- except huggingface_hub.utils.EntryNotFoundError:
23
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
24
-
25
- # Enforce license metadata
26
- if card.data.license is None:
27
- if not ("license_name" in card.data and "license_link" in card.data):
28
- return (
29
- False,
30
- (
31
- "License not found. Please add a license to your model card using the `license` metadata or a"
32
- " `license_name`/`license_link` pair."
33
- ),
34
- None,
35
- )
36
-
37
- # Enforce card content
38
- if len(card.text) < 200:
39
- return False, "Please add a description to your model card, it is too short.", None
40
-
41
- return True, "", card
42
-
43
-
44
- def is_model_on_hub(
45
- model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
46
- ) -> tuple[bool, str, AutoConfig]:
47
- try:
48
- config = AutoConfig.from_pretrained(
49
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
50
- ) # , force_download=True)
51
- if test_tokenizer:
52
- try:
53
- tk = AutoTokenizer.from_pretrained(
54
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
55
- )
56
- except ValueError as e:
57
- return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
58
- except Exception:
59
- return (
60
- False,
61
- "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
62
- None,
63
- )
64
- return True, None, config
65
-
66
- except ValueError:
67
- return (
68
- False,
69
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
70
- None,
71
- )
72
-
73
- except Exception as e:
74
- if "You are trying to access a gated repo." in str(e):
75
- return True, "uses a gated model.", None
76
- return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
77
-
78
-
79
- def get_model_size(model_info: ModelInfo, precision: str):
80
- size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
81
- safetensors = None
82
- try:
83
- safetensors = get_safetensors_metadata(model_info.id)
84
- except Exception as e:
85
- print(e)
86
-
87
- if safetensors is not None:
88
- model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
89
- else:
90
- try:
91
- size_match = re.search(size_pattern, model_info.id.lower())
92
- model_size = size_match.group(0)
93
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
94
- except AttributeError:
95
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
96
-
97
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
98
- model_size = size_factor * model_size
99
- return model_size
100
-
101
-
102
- def get_model_arch(model_info: ModelInfo):
103
- return model_info.config.get("architectures", "Unknown")
104
-
105
-
106
- def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
107
- # Increase quota first if user has higher limits
108
- if org_or_user in HAS_HIGHER_RATE_LIMIT:
109
- rate_limit_quota *= 2
110
-
111
- if org_or_user not in users_to_submission_dates:
112
- return True, ""
113
-
114
- submission_dates = sorted(users_to_submission_dates[org_or_user])
115
- time_limit = datetime.now(timezone.utc) - timedelta(days=rate_limit_period)
116
-
117
- submissions_after_timelimit = [
118
- parse_iso8601_datetime(d) for d in submission_dates
119
- if parse_iso8601_datetime(d) > time_limit
120
- ]
121
-
122
- num_models_submitted_in_period = len(submissions_after_timelimit)
123
-
124
- # Use >= to correctly enforce the rate limit
125
- if num_models_submitted_in_period >= rate_limit_quota:
126
- error_msg = f"Organisation or user `{org_or_user}` already has {num_models_submitted_in_period} model requests submitted in the last {rate_limit_period} days.\n"
127
- error_msg += "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
128
- return False, error_msg
129
-
130
- return True, ""
131
-
132
-
133
- def already_submitted_models(requested_models_dir: str) -> set[str]:
134
- depth = 1
135
- file_names = []
136
- users_to_submission_dates = defaultdict(list)
137
-
138
- for root, _, files in os.walk(requested_models_dir):
139
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
140
- if current_depth == depth:
141
- for file in files:
142
- if not file.endswith(".json"):
143
- continue
144
- with open(os.path.join(root, file), "r") as f:
145
- info = json.load(f)
146
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
147
-
148
- # Select organisation
149
- if info["model"].count("/") == 0 or "submitted_time" not in info:
150
- continue
151
- organisation, _ = info["model"].split("/")
152
- users_to_submission_dates[organisation].append(info["submitted_time"])
153
-
154
- return set(file_names), users_to_submission_dates
155
-
156
-
157
- def get_model_tags(model_card, model: str):
158
- is_merge_from_metadata = False
159
- is_moe_from_metadata = False
160
-
161
- tags = []
162
- if model_card is None:
163
- return tags
164
- if model_card.data.tags:
165
- is_merge_from_metadata = any(
166
- [tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]]
167
- )
168
- is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
169
-
170
- is_merge_from_model_card = any(
171
- keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
172
- )
173
- if is_merge_from_model_card or is_merge_from_metadata:
174
- tags.append("merge")
175
- is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
176
- # Hardcoding because of gating problem
177
- if "Qwen/Qwen1.5-32B" in model:
178
- is_moe_from_model_card = False
179
- is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
180
- if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
181
- tags.append("moe")
182
-
183
- return tags
src/submission/submit.py DELETED
@@ -1,186 +0,0 @@
- import json
- import os
- from datetime import datetime, timezone
-
- from dataclasses import dataclass
- from transformers import AutoConfig
-
- from src.display.formatting import styled_error, styled_message, styled_warning
- from src.envs import (
-     API,
-     EVAL_REQUESTS_PATH,
-     HF_TOKEN,
-     QUEUE_REPO,
-     RATE_LIMIT_PERIOD,
-     RATE_LIMIT_QUOTA,
- )
- from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
- from src.submission.check_validity import (
-     already_submitted_models,
-     check_model_card,
-     get_model_size,
-     is_model_on_hub,
-     user_submission_permission,
- )
-
- REQUESTED_MODELS = None
- USERS_TO_SUBMISSION_DATES = None
-
- @dataclass
- class ModelSizeChecker:
-     model: str
-     precision: str
-     model_size_in_b: float
-
-     def get_precision_factor(self):
-         if self.precision in ["float16", "bfloat16"]:
-             return 1
-         elif self.precision == "8bit":
-             return 2
-         elif self.precision == "4bit":
-             return 4
-         elif self.precision == "GPTQ":
-             config = AutoConfig.from_pretrained(self.model)
-             num_bits = int(config.quantization_config["bits"])
-             bits_to_precision_factor = {2: 8, 3: 6, 4: 4, 8: 2}
-             return bits_to_precision_factor.get(num_bits, 1)
-         else:
-             raise Exception(f"Unknown precision {self.precision}.")
-
-     def can_evaluate(self):
-         precision_factor = self.get_precision_factor()
-         return self.model_size_in_b <= 140 * precision_factor
-
- def add_new_eval(
-     model: str,
-     base_model: str,
-     revision: str,
-     precision: str,
-     weight_type: str,
-     model_type: str,
-     use_chat_template: bool,
- ):
-     global REQUESTED_MODELS
-     global USERS_TO_SUBMISSION_DATES
-     if not REQUESTED_MODELS:
-         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-     user_name = ""
-     model_path = model
-     if "/" in model:
-         user_name = model.split("/")[0]
-         model_path = model.split("/")[1]
-
-     precision = precision.split(" ")[0]
-     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-     if model_type is None or model_type == "":
-         return styled_error("Please select a model type.")
-
-     # Is the user rate limited?
-     if user_name != "":
-         user_can_submit, error_msg = user_submission_permission(
-             user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
-         )
-         if not user_can_submit:
-             return styled_error(error_msg)
-
-     # Did the model authors forbid its submission to the leaderboard?
-     if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
-         return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
-
-     # Does the model actually exist?
-     if revision == "":
-         revision = "main"
-     try:
-         model_info = API.model_info(repo_id=model, revision=revision)
-     except Exception as e:
-         return styled_error("Could not get your model information. Please fill it up properly.")
-
-     # Check model size early
-     model_size = get_model_size(model_info=model_info, precision=precision)
-
-     # First check: Absolute size limit for float16 and bfloat16
-     if precision in ["float16", "bfloat16"] and model_size > 100:
-         return styled_error(f"Sadly, models larger than 100B parameters cannot be submitted in {precision} precision at this time. "
-                             f"Your model size: {model_size:.2f}B parameters.")
-
-     # Second check: Precision-adjusted size limit for 8bit, 4bit, and GPTQ
-     if precision in ["8bit", "4bit", "GPTQ"]:
-         size_checker = ModelSizeChecker(model=model, precision=precision, model_size_in_b=model_size)
-
-         if not size_checker.can_evaluate():
-             precision_factor = size_checker.get_precision_factor()
-             max_size = 140 * precision_factor
-             return styled_error(f"Sadly, models this big ({model_size:.2f}B parameters) cannot be evaluated automatically "
-                                 f"at the moment on our cluster. The maximum size for {precision} precision is {max_size:.2f}B parameters.")
-
-     architecture = "?"
-     # Is the model on the hub?
-     if weight_type in ["Delta", "Adapter"]:
-         base_model_on_hub, error, _ = is_model_on_hub(
-             model_name=base_model, revision="main", token=HF_TOKEN, test_tokenizer=True
-         )
-         if not base_model_on_hub:
-             return styled_error(f'Base model "{base_model}" {error}')
-     if not weight_type == "Adapter":
-         model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=model_info.sha, test_tokenizer=True)
-         if not model_on_hub or model_config is None:
-             return styled_error(f'Model "{model}" {error}')
-         if model_config is not None:
-             architectures = getattr(model_config, "architectures", None)
-             if architectures:
-                 architecture = ";".join(architectures)
-
-     # Were the model card and license filled?
-     try:
-         model_info.cardData["license"]
-     except Exception:
-         return styled_error("Please select a license for your model")
-
-     modelcard_OK, error_msg, model_card = check_model_card(model)
-     if not modelcard_OK:
-         return styled_error(error_msg)
-
-     # Seems good, creating the eval
-     print("Adding new eval")
-
-     eval_entry = {
-         "model": model,
-         "base_model": base_model,
-         "revision": model_info.sha,  # force to use the exact model commit
-         "precision": precision,
-         "params": model_size,
-         "architectures": architecture,
-         "weight_type": weight_type,
-         "status": "PENDING",
-         "submitted_time": current_time,
-         "model_type": model_type,
-         "job_id": -1,
-         "job_start_time": None,
-         "use_chat_template": use_chat_template,
-     }
-
-     print("Creating eval file")
-     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-     os.makedirs(OUT_DIR, exist_ok=True)
-     out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-     with open(out_path, "w") as f:
-         f.write(json.dumps(eval_entry))
-
-     print("Uploading eval file")
-     API.upload_file(
-         path_or_fileobj=out_path,
-         path_in_repo=out_path.split("eval-queue/")[1],
-         repo_id=QUEUE_REPO,
-         repo_type="dataset",
-         commit_message=f"Add {model} to eval queue",
-     )
-
-     # Remove the local file
-     os.remove(out_path)
-
-     return styled_message(
-         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-     )
 
src/tools/create_request_file.py DELETED
@@ -1,92 +0,0 @@
- import json
- import os
- import pprint
- from datetime import datetime, timezone
-
- import click
- from colorama import Fore
- from huggingface_hub import HfApi, snapshot_download
-
- from src.display.utils import ModelType, WeightType
- from src.submission.check_validity import get_model_size
-
- EVAL_REQUESTS_PATH = "eval-queue"
- QUEUE_REPO = "open-llm-leaderboard/requests"
-
- precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
- model_types = [e.name for e in ModelType]
- weight_types = [e.name for e in WeightType]
-
-
- def main():
-     api = HfApi()
-     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-     snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
-
-     model_name = click.prompt("Enter model name")
-     revision = click.prompt("Enter revision", default="main")
-     precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-     model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-     weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-     base_model = click.prompt("Enter base model", default="")
-     status = click.prompt("Enter status", default="FINISHED")
-
-     try:
-         model_info = api.model_info(repo_id=model_name, revision=revision)
-     except Exception as e:
-         print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-         return 1
-
-     model_size = get_model_size(model_info=model_info, precision=precision)
-
-     try:
-         license = model_info.cardData["license"]
-     except Exception:
-         license = "?"
-
-     eval_entry = {
-         "model": model_name,
-         "base_model": base_model,
-         "revision": model_info.sha,  # force to use the exact model commit
-         "private": False,
-         "precision": precision,
-         "weight_type": weight_type,
-         "status": status,
-         "submitted_time": current_time,
-         "model_type": model_type,
-         "likes": model_info.likes,
-         "params": model_size,
-         "license": license,
-     }
-
-     user_name = ""
-     model_path = model_name
-     if "/" in model_name:
-         user_name = model_name.split("/")[0]
-         model_path = model_name.split("/")[1]
-
-     pprint.pprint(eval_entry)
-
-     if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-         click.echo("continuing...")
-
-         out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-         os.makedirs(out_dir, exist_ok=True)
-         out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
-
-         with open(out_path, "w") as f:
-             f.write(json.dumps(eval_entry))
-
-         api.upload_file(
-             path_or_fileobj=out_path,
-             path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-             repo_id=QUEUE_REPO,
-             repo_type="dataset",
-             commit_message=f"Add {model_name} to eval queue",
-         )
-     else:
-         click.echo("aborting...")
-
-
- if __name__ == "__main__":
-     main()
 
src/tools/plots.py DELETED
@@ -1,152 +0,0 @@
- import numpy as np
- import pandas as pd
- import plotly.express as px
- from plotly.graph_objs import Figure
-
- from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
- # from src.display.utils import human_baseline_row as HUMAN_BASELINE
- from src.leaderboard.filter_models import FLAGGED_MODELS
-
-
- def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
-     """
-     Generates a DataFrame containing the maximum scores until each date.
-
-     :param results_df: A DataFrame containing result information including metric scores and dates.
-     :return: A new DataFrame containing the maximum scores until each date for every metric.
-     """
-     # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-     results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
-     results_df.sort_values(by="date", inplace=True)
-
-     # Step 2: Initialize the scores dictionary
-     scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
-
-     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-     for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
-         current_max = 0
-         last_date = ""
-         column = task.col_name
-         for _, row in results_df.iterrows():
-             current_model = row[AutoEvalColumn.fullname.name]
-             # We ignore models that are flagged/no longer on the hub/not finished
-             to_ignore = (
-                 not row[AutoEvalColumn.still_on_hub.name]
-                 or not row[AutoEvalColumn.not_flagged.name]
-                 or current_model in FLAGGED_MODELS
-             )
-             if to_ignore:
-                 continue
-
-             current_date = row[AutoEvalColumn.date.name]
-             current_score = row[task.col_name]
-
-             if current_score > current_max:
-                 if current_date == last_date and len(scores[column]) > 0:
-                     scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
-                 else:
-                     scores[column].append({"model": current_model, "date": current_date, "score": current_score})
-                 current_max = current_score
-                 last_date = current_date
-
-     # Step 4: Return all dictionaries as DataFrames
-     return {k: pd.DataFrame(v) for k, v in scores.items()}
-
-
- def create_plot_df(scores_df: dict[str : pd.DataFrame]) -> pd.DataFrame:
-     """
-     Transforms the scores DataFrame into a new format suitable for plotting.
-
-     :param scores_df: A DataFrame containing metric scores and dates.
-     :return: A new DataFrame reshaped for plotting purposes.
-     """
-     # Initialize the list to store DataFrames
-     dfs = []
-     # Iterate over the cols and create a new DataFrame for each column
-     for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
-         d = scores_df[col].reset_index(drop=True)
-         d["task"] = col
-         dfs.append(d)
-
-     # Concatenate all the created DataFrames
-     concat_df = pd.concat(dfs, ignore_index=True)
-
-     # # Sort values by 'date'
-     # concat_df.sort_values(by="date", inplace=True)
-     # concat_df.reset_index(drop=True, inplace=True)
-     # return concat_df
-
-
- def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
-     """
-     Create a Plotly figure object with lines representing different metrics
-     and horizontal dotted lines representing human baselines.
-
-     :param df: The DataFrame containing the metric values, names, and dates.
-     :param metrics: A list of strings representing the names of the metrics
-                     to be included in the plot.
-     :param title: A string representing the title of the plot.
-     :return: A Plotly figure object with lines representing metrics and
-              horizontal dotted lines representing human baselines.
-     """
-
-     # Filter the DataFrame based on the specified metrics
-     df = df[df["task"].isin(metrics)]
-
-     # Filter the human baselines based on the specified metrics
-     filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
-
-     # Create a line figure using plotly express with specified markers and custom data
-     fig = px.line(
-         df,
-         x="date",
-         y="score",
-         color="task",
-         markers=True,
-         custom_data=["task", "score", "model"],
-         title=title,
-     )
-
-     # Update hovertemplate for better hover interaction experience
-     fig.update_traces(
-         hovertemplate="<br>".join(
-             [
-                 "Model Name: %{customdata[2]}",
-                 "Metric Name: %{customdata[0]}",
-                 "Date: %{x}",
-                 "Metric Value: %{y}",
-             ]
-         )
-     )
-
-     # Update the range of the y-axis
-     fig.update_layout(yaxis_range=[0, 100])
-
-     # Create a dictionary to hold the color mapping for each metric
-     metric_color_mapping = {}
-
-     # Map each metric name to its color in the figure
-     for trace in fig.data:
-         metric_color_mapping[trace.name] = trace.line.color
-
-     # Iterate over filtered human baselines and add horizontal lines to the figure
-     for metric, value in filtered_human_baselines.items():
-         color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
-         location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
-         # Add horizontal line with matched color and positioned annotation
-         fig.add_hline(
-             y=value,
-             line_dash="dot",
-             annotation_text=f"{metric} human baseline",
-             annotation_position=location,
-             annotation_font_size=10,
-             annotation_font_color=color,
-             line_color=color,
-         )
-
-     return fig
-
-
- # Example Usage:
- # human_baselines dictionary is defined.
- # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
 
src/voting/vote_system.py DELETED
@@ -1,151 +0,0 @@
- import json
- import logging
- import pathlib
- import pandas as pd
- import gradio as gr
- import schedule
- import time
- from datetime import datetime, timezone
- from src.display.utils import EvalQueueColumn
-
- from src.envs import API
-
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- class VoteManager:
-     def __init__(self, votes_path, eval_requests_path, repo_id):
-         self.votes_path = votes_path
-         self.eval_requests_path = eval_requests_path
-         self.repo_id = repo_id
-         self.vote_dataset = self.read_vote_dataset()
-         self.vote_check_set = self.make_check_set(self.vote_dataset)
-         self.votes_to_upload = []
-
-     def init_vote_dataset(self):
-         self.vote_dataset = self.read_vote_dataset()
-         self.vote_check_set = self.make_check_set(self.vote_dataset)
-
-     def read_vote_dataset(self):
-         result = []
-         votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
-         if votes_file.exists():
-             with open(votes_file, "r") as f:
-                 for line in f:
-                     data = json.loads(line.strip())
-                     result.append(data)
-         result = pd.DataFrame(result)
-         return result
-
-     def make_check_set(self, vote_dataset: pd.DataFrame):
-         result = list()
-         for row in vote_dataset.itertuples(index=False, name='vote'):
-             result.append((row.model, row.revision, row.username))
-         return set(result)
-
-     def get_model_revision(self, selected_model: str) -> str:
-         """Fetch the revision for the given model from the request files."""
-         for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
-             if user_folder.is_dir():
-                 for file in user_folder.glob("*.json"):
-                     with open(file, "r") as f:
-                         data = json.load(f)
-                         if data.get("model") == selected_model:
-                             return data.get("revision", "main")
-         return "main"
-
-     def create_request_vote_df(self, pending_models_df: gr.Dataframe):
-         if pending_models_df.empty or not "model_name" in pending_models_df.columns:
-             return pending_models_df
-         self.vote_dataset = self.read_vote_dataset()
-         vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
-
-         pending_models_df_votes = pd.merge(
-             pending_models_df,
-             vote_counts,
-             left_on=["model_name", 'revision'],
-             right_on=['model', 'revision'],
-             how='left'
-         )
-         # Filling empty votes
-         pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
-         pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
-         # Removing useless columns
-         pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
-         return pending_models_df_votes
-
-     # Function to be called when a user votes for a model
-     def add_vote(
-         self,
-         selected_model: str,
-         pending_models_df: gr.Dataframe,
-         profile: gr.OAuthProfile | None
-     ):
-         logger.debug(f"Type of list before usage: {type(list)}")
-         # model_name, revision, user_id, timestamp
-         if selected_model in ["str", ""]:
-             gr.Warning("No model selected")
-             return
-
-         if profile is None:
-             gr.Warning("Hub Login required")
-             return
-
-         vote_username = profile.username
-         model_revision = self.get_model_revision(selected_model)
-
-         # tuple (immutable) for checking than already voted for model
-         check_tuple = (selected_model, model_revision, vote_username)
-         if check_tuple in self.vote_check_set:
-             gr.Warning("Already voted for this model")
-             return
-
-         current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-         vote_obj = {
-             "model": selected_model,
-             "revision": model_revision,
-             "username": vote_username,
-             "timestamp": current_time
-         }
-
-         # Append the vote to the JSONL file
-         try:
-             votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
-             with open(votes_file, "a") as f:
-                 f.write(json.dumps(vote_obj) + "\n")
-             logger.info(f"Vote added locally: {vote_obj}")
-
-             self.votes_to_upload.append(vote_obj)
-         except Exception as e:
-             logger.error(f"Failed to write vote to file: {e}")
-             gr.Warning("Failed to record vote. Please try again")
-             return
-
-         self.vote_check_set.add(check_tuple)
-         gr.Info(f"Voted for {selected_model}")
-
-         return self.create_request_vote_df(pending_models_df)
-
-     def upload_votes(self):
-         if self.votes_to_upload:
-             votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
-             try:
-                 with open(votes_file, "rb") as f:
-                     API.upload_file(
-                         path_or_fileobj=f,
-                         path_in_repo="votes_data.jsonl",
-                         repo_id=self.repo_id,
-                         repo_type="dataset",
-                         commit_message="Updating votes_data.jsonl with new votes",
-                     )
-                 logger.info("Votes uploaded to votes repository")
-                 self.votes_to_upload.clear()
-             except Exception as e:
-                 logger.error(f"Failed to upload votes to repository: {e}")
-
- def run_scheduler(vote_manager):
-     while True:
-         schedule.run_pending()
-         time.sleep(1)
 
tests/submission/test_user_submission_permission.py DELETED
@@ -1,98 +0,0 @@
- import unittest
- from unittest.mock import patch
- from datetime import datetime, timedelta, timezone
-
- from src.submission.check_validity import user_submission_permission
- from src.envs import RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
-
- class TestUserSubmissionPermission(unittest.TestCase):
-
-     def setUp(self):
-         self.user_name = "test_user"
-         self.rate_limit_period = RATE_LIMIT_PERIOD
-         self.rate_limit_quota = RATE_LIMIT_QUOTA
-         self.fixed_now = datetime(2023, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
-         # Submission dates that simulate various test cases
-         self.users_to_submission_dates = {
-             "test_user": [
-                 (self.fixed_now - timedelta(days=1)).isoformat(),
-                 (self.fixed_now - timedelta(days=2)).isoformat(),
-                 (self.fixed_now - timedelta(days=3)).isoformat(),
-                 (self.fixed_now - timedelta(days=4)).isoformat(),
-             ]
-         }
-
-     @staticmethod
-     def fixed_datetime_now(tz=None):
-         return datetime(2023, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
-
-     @patch('src.submission.check_validity.datetime')
-     def test_user_below_quota(self, mock_datetime):
-         mock_datetime.now.side_effect = self.fixed_datetime_now
-         mock_datetime.fromisoformat = datetime.fromisoformat
-         allowed, message = user_submission_permission(
-             self.user_name, self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertTrue(allowed)
-
-     @patch('src.submission.check_validity.datetime')
-     def test_user_at_quota(self, mock_datetime):
-         mock_datetime.now.side_effect = self.fixed_datetime_now
-         mock_datetime.fromisoformat = datetime.fromisoformat
-
-         # Add one more submission to reach the quota
-         self.users_to_submission_dates["test_user"].append(self.fixed_now.isoformat())
-
-         allowed, message = user_submission_permission(
-             self.user_name, self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertFalse(allowed)
-         expected_message = (
-             f"Organisation or user `{self.user_name}` already has {self.rate_limit_quota} model requests submitted "
-             f"in the last {self.rate_limit_period} days.\n"
-             "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
-         )
-         self.assertEqual(message, expected_message)
-
-     @patch('src.submission.check_validity.datetime')
-     def test_user_above_quota(self, mock_datetime):
-         mock_datetime.now.side_effect = self.fixed_datetime_now
-         mock_datetime.fromisoformat = datetime.fromisoformat
-         # Add more than quota submissions
-         for _ in range(self.rate_limit_quota + 1):
-             self.users_to_submission_dates["test_user"].append(self.fixed_now.isoformat())
-         allowed, message = user_submission_permission(
-             self.user_name, self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertFalse(allowed)
-
-     def test_user_no_previous_submissions(self):
-         allowed, message = user_submission_permission(
-             "new_user", self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertTrue(allowed)
-
-     @patch('src.submission.check_validity.HAS_HIGHER_RATE_LIMIT', ["specific_user"])
-     @patch('src.submission.check_validity.datetime')
-     def test_user_higher_rate_limit(self, mock_datetime):
-         mock_datetime.now.side_effect = self.fixed_datetime_now
-         mock_datetime.fromisoformat = datetime.fromisoformat
-         self.users_to_submission_dates["specific_user"] = [self.fixed_now.isoformat()] * (self.rate_limit_quota + 1)
-         allowed, message = user_submission_permission(
-             "specific_user", self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertTrue(allowed)
-
-     @patch('src.submission.check_validity.datetime')
-     def test_submission_just_outside_window(self, mock_datetime):
-         mock_datetime.now.side_effect = self.fixed_datetime_now
-         mock_datetime.fromisoformat = datetime.fromisoformat
-         old_submission = (self.fixed_now - timedelta(days=self.rate_limit_period, seconds=1)).isoformat()
-         self.users_to_submission_dates["test_user"] = [old_submission]
-         allowed, message = user_submission_permission(
-             self.user_name, self.users_to_submission_dates, self.rate_limit_period, self.rate_limit_quota
-         )
-         self.assertTrue(allowed)
-
- if __name__ == '__main__':
-     unittest.main()