open_llm_leaderboard

Running

App Files Files Community

clefourrier HF staff

Wauplin HF staff committed on Jun 3

Commit

b4ba8b7

•

1 Parent(s): 3ac217c

loading_from_contents (#766)

Browse files

- init - cleaning the code base, plus adding the new system to load from contents (4fc38646dccf6d3719eaf48a8dfd05c4a032fad0)
- added collections back to main (8618a2a9da2186516ef4dec2dd87f14322de9719)
- rm (459932d6f2d58fe06ffd5392686f723a08c9b734)
- simplified env vars (23f614e2ea6cf7dcfb37912a464e2d8c24085b70)
- test 1 with webhooks (784d3edc7dc5f5f0439a082a4d0a1cf6376416f6)
- small modif (32ea1bc7cefef89f251e4de467b3d49579d60feb)
- trying with open link (0cb7d54ebfa0af3b1fb240a5cd2d043799379791)
- Update app.py (e3b01f36af4a62b3cc3ba1cd88e665ad496fb839)
- removing share true (3cc4e3e275d1561c7aaa647db593d33d90434f1f)
- Update app.py (52608b2305c0c499835dc0a9892e57b2fa4f61af)
- Update app.py (953dbe38df6163c16df1b40daa579c81c07f72db)
- the webhooks will download the model at each update, and demo.load will restart the viewer at each page refresh (388bfbdf61f906fb0574cf8477aaf19941548368)
- added plots back (294422eeb5e3bcfb489bdf41322bbc3c7cc1632c)
- fixed! (fa8d7663cb995885cb91746a89ce1a2b3ff7f7ca)
- replace HuggingFaceH4 by open-llm-leaderboard (2acf509d0df752206adf666c682823be1a99991f)
- rm dynamic file reference (b4f48ba26897f4c72d213355f91b21555be04da8)

Co-authored-by: Lucain Pouget <Wauplin@users.noreply.huggingface.co>

Files changed (16) hide show

README.md +1 -2
app.py +129 -79
requirements.txt +1 -1
src/display/about.py +2 -2
src/display/utils.py +1 -0
src/envs.py +4 -18
src/leaderboard/filter_models.py +122 -110
src/leaderboard/read_evals.py +0 -261
src/populate.py +8 -7
src/scripts/update_all_request_files.py +0 -129
src/submission/check_validity.py +1 -1
src/submission/submit.py +3 -30
src/tools/collections.py +0 -76
src/{scripts → tools}/create_request_file.py +0 -0
src/tools/model_backlinks.py +2 -2
src/tools/plots.py +7 -13

README.md CHANGED Viewed

@@ -8,14 +8,13 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 startup_duration_timeout: 1h
 space_ci:
   private: true
   secrets:
   - HF_TOKEN
-  - H4_TOKEN
 tags:
 - leaderboard
 short_description: Track, rank and evaluate open LLMs and chatbots

 app_file: app.py
 pinned: true
 license: apache-2.0
 fullWidth: true
 startup_duration_timeout: 1h
 space_ci:
   private: true
   secrets:
   - HF_TOKEN
+  - WEBHOOK_SECRET
 tags:
 - leaderboard
 short_description: Track, rank and evaluate open LLMs and chatbots

app.py CHANGED Viewed

@@ -2,10 +2,9 @@ import os
 import logging
 import time
 import gradio as gr
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-from gradio_space_ci import enable_space_ci
 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -30,32 +29,27 @@ from src.display.utils import (
 )
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
-    EVAL_RESULTS_PATH,
-    H4_TOKEN,
-    IS_PUBLIC,
     QUEUE_REPO,
     REPO_ID,
-    RESULTS_REPO,
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.scripts.update_all_request_files import update_dynamic_files
 from src.submission.submit import add_new_eval
-from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-# Start ephemeral Spaces on PRs (see config in README.md)
-enable_space_ci()
 def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 def time_diff_wrapper(func):
@@ -94,54 +88,90 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
-def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
-    if full_init:
         # These downloads only occur on full initialization
         try:
             download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-            download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-            download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
         except Exception:
             restart_space()
-    # Always retrieve the leaderboard DataFrame
-    raw_data, original_df = get_leaderboard_df(
-        results_path=EVAL_RESULTS_PATH,
-        requests_path=EVAL_REQUESTS_PATH,
-        dynamic_path=DYNAMIC_INFO_FILE_PATH,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-    if full_init:
-        # Collection update only happens on full initialization
-        update_collections(original_df)
-    leaderboard_df = original_df.copy()
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    return leaderboard_df, raw_data, original_df, eval_queue_dfs
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 # Data processing for plots now only on demand in the respective Gradio tab
 def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(raw_data))
     return plot_df
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -150,37 +180,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = Leaderboard(
-                value=leaderboard_df,
-                datatype=[c.type for c in fields(AutoEvalColumn)],
-                select_columns=SelectColumns(
-                    default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-                    cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-                    label="Select Columns to Display:",
-                ),
-                search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
-                hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-                filter_columns=[
-                    ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-                    ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-                    ColumnFilter(
-                        AutoEvalColumn.params.name,
-                        type="slider",
-                        min=0.01,
-                        max=150,
-                        label="Select the number of parameters (B)",
-                    ),
-                    ColumnFilter(
-                        AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
-                    ),
-                    ColumnFilter(
-                        AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
-                    ),
-                    ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
-                    ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
-                ],
-                bool_checkboxgroup_label="Hide models",
-            )
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
@@ -219,7 +219,6 @@ with demo:
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
@@ -290,7 +289,6 @@ with demo:
                     base_model_name_textbox,
                     revision_name_textbox,
                     precision,
-                    private,
                     weight_type,
                     model_type,
                 ],
@@ -307,9 +305,61 @@ with demo:
                 show_copy_button=True,
             )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
-scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

 import logging
 import time
 import gradio as gr
+import datasets
+from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from src.display.about import (
     CITATION_BUTTON_LABEL,
 )
 from src.envs import (
     API,
     EVAL_REQUESTS_PATH,
+    AGGREGATED_REPO,
+    HF_TOKEN,
     QUEUE_REPO,
     REPO_ID,
+    HF_HOME,
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 def time_diff_wrapper(func):
             attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
+def get_latest_data_leaderboard():
+    leaderboard_dataset = datasets.load_dataset(
+        AGGREGATED_REPO,
+        "default",
+        split="train",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    )
+    leaderboard_df = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
+    )
+    return leaderboard_df
+def get_latest_data_queue():
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return eval_queue_dfs
+def init_space():
     """Initializes the application space, loading only necessary data."""
+    if DO_FULL_INIT:
         # These downloads only occur on full initialization
         try:
             download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         except Exception:
             restart_space()
+    # Always reload the leaderboard DataFrame (read from the local dataset cache when it exists)
+    leaderboard_df = get_latest_data_leaderboard()
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
+    eval_queue_dfs = get_latest_data_queue()
+    return leaderboard_df, eval_queue_dfs
 # Calls the init_space function; whether the queue dataset is downloaded first is controlled by the DO_FULL_INIT flag.
 # This initializes the DataFrames used throughout the application.
+leaderboard_df, eval_queue_dfs = init_space()
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 # Data processing for plots now only on demand in the respective Gradio tab
 def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(leaderboard_df))
     return plot_df
+def init_leaderboard(dataframe):
+    return Leaderboard(
+        value = dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
+            ),
+            ColumnFilter(
+                AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
+            ),
+            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
+        ],
+        bool_checkboxgroup_label="Hide models",
+    )
 demo = gr.Blocks(css=custom_css)
 with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(leaderboard_df)
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
                     base_model_name_textbox,
                     revision_name_textbox,
                     precision,
                     weight_type,
                     model_type,
                 ],
                 show_copy_button=True,
             )
+    demo.load(fn=get_latest_data_leaderboard, inputs=None, outputs=[leaderboard])
+    demo.load(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+demo.queue(default_concurrency_limit=40)
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=demo)
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=demo)
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=demo)
+# Add webhooks
+@webhooks_server.add_webhook
+async def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
+@webhooks_server.add_webhook
+async def update_queue(payload: WebhookPayload) -> None:
+    """Redownloads the queue dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+webhooks_server.launch()

requirements.txt CHANGED Viewed

@@ -15,4 +15,4 @@ transformers==4.41.1
 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
 gradio==4.20.0
-gradio_leaderboard==0.0.8

 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
 gradio==4.20.0
+gradio_leaderboard==0.0.9

src/display/about.py CHANGED Viewed

@@ -81,7 +81,7 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 ### Useful links
-- [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
 - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 ### Other cool leaderboards:
@@ -217,7 +217,7 @@ CITATION_BUTTON_TEXT = r"""
   title = {Open LLM Leaderboard},
   year = {2023},
   publisher = {Hugging Face},
-  howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
 }
 @software{eval-harness,
   author       = {Gao, Leo and

 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 ### Useful links
+- [Community resources](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/174)
 - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 ### Other cool leaderboards:
   title = {Open LLM Leaderboard},
   year = {2023},
   publisher = {Hugging Face},
+  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
 }
 @software{eval-harness,
   author       = {Gao, Leo and

src/display/utils.py CHANGED Viewed

@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])

 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])

src/envs.py CHANGED Viewed

@@ -2,17 +2,11 @@ import os
 from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
-REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
 QUEUE_REPO = "open-llm-leaderboard/requests"
-DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
-RESULTS_REPO = "open-llm-leaderboard/results"
-PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
-PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 HF_HOME = os.getenv("HF_HOME", ".")
@@ -27,18 +21,10 @@ else:
     print("Write access confirmed for HF_HOME")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
-DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
-DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
-EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
-EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
-PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
 HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
-API = HfApi(token=H4_TOKEN)

 from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+REPO_ID = "open-llm-leaderboard/open_llm_leaderboard"
 QUEUE_REPO = "open-llm-leaderboard/requests"
+AGGREGATED_REPO = "open-llm-leaderboard/contents"
 HF_HOME = os.getenv("HF_HOME", ".")
     print("Write access confirmed for HF_HOME")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
 HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
+API = HfApi(token=HF_TOKEN)

src/leaderboard/filter_models.py CHANGED Viewed

@@ -5,120 +5,120 @@ from src.display.utils import AutoEvalColumn
 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
 FLAGGED_MODELS = {
-    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
-    "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "cookinai/BruinHermes": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
-    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
     "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
-    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
-    "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
-    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
-    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
-    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
     # Merges not indicated
-    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
-    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
-    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
-    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
-    "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     # MoErges
-    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     # Other - contamination mostly
-    "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
-    "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
-    "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
-    "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
 }
 # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
         leaderboard_data.pop(ix)
     return leaderboard_data
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)

 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
 FLAGGED_MODELS = {
+    "merged": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
+    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
+    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
+    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
+    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
+    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
+    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
+    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
+    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
+    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
+    "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
+    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
+    "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
     "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
+    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
+    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
+    "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
+    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
+    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
+    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
     # Merges not indicated
+    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
+    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
+    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
+    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
+    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
+    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
+    "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
     # MoErges
+    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
+    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
     # Other - contamination mostly
+    "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
+    "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
+    "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
+    "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
 }
 # Models which have been requested by orgs to not be submitted on the leaderboard
         leaderboard_data.pop(ix)
     return leaderboard_data
+# NOTE(review): the bare string literal below holds a disabled draft of
+# remove_forbidden_models that uses .iterrows() / .drop() (presumably on a pandas
+# DataFrame - confirm). As a string expression statement it is never executed;
+# the list-based implementation above is the one in effect.
+"""
+def remove_forbidden_models(leaderboard_data):
+    #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
+    indices_to_remove = []
+    for ix, row in leaderboard_data.iterrows():
+        if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
+            indices_to_remove.append(ix)
+    # Remove the models from the list
+    return leaderboard_data.drop(indices_to_remove)
+"""
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)

src/leaderboard/read_evals.py DELETED Viewed

@@ -1,261 +0,0 @@
-import json
-from pathlib import Path
-from json import JSONDecodeError
-import logging
-import math
-from dataclasses import dataclass, field
-from typing import Optional, Dict, List
-from tqdm import tqdm
-from tqdm.contrib.logging import logging_redirect_tqdm
-import numpy as np
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-@dataclass
-class EvalResult:
-    """Benchmark scores and hub metadata for one model evaluated at one precision.
-    Instances are created from a results JSON file (``init_from_json_file``), enriched
-    from the matching request file and from the dynamic-info dict, and finally
-    flattened with ``to_dict`` for the dataframe display.
-    """
-    # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: Optional[str]
-    model: str
-    revision: str  # commit hash, "" if main
-    results: Dict[str, float]
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"  # From config file
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = True
-    is_merge: bool = False
-    not_flagged: bool = False
-    status: str = "FINISHED"
-    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
-    tags: List[str] = field(default_factory=list)
-    @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
-        """Build an EvalResult from one results JSON file.
-        The model name, dtype and commit sha are read from the ``config_general``
-        section; scores are extracted with ``extract_results``. Fields that come
-        from request files or dynamic info keep their defaults.
-        """
-        with open(json_filepath, "r") as fp:
-            data = json.load(fp)
-        config = data.get("config_general", {})
-        precision = Precision.from_str(config.get("model_dtype", "unknown"))
-        # "org/model" splits into two parts, a bare "model" into one.
-        org_and_model = config.get("model_name", "").split("/", 1)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org, model = org_and_model
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-        results = cls.extract_results(data)
-        return cls(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-        )
-    @staticmethod
-    def extract_results(data: Dict) -> Dict[str, float]:
-        """
-        Extract and process benchmark results from a given dict.
-        Parameters:
-        - data (Dict): A dictionary containing benchmark data. This dictionary must
-        include 'versions' and 'results' keys with respective sub-data.
-        Returns:
-        - Dict[str, float]: A dictionary where keys are benchmark names and values
-        are the processed average scores as percentages.
-        Notes:
-        - Outdated MMLU entries (harness version 0) are skipped entirely.
-        - Benchmarks with no matching result, or with a result lacking the metric, are skipped.
-        - A NaN score sets the corresponding benchmark result to 0.0.
-        - Otherwise scores are averaged across matching entries, in a percentage format.
-        """
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We skip old mmlu entries. The check must skip the whole task, so it is
-            # expressed on the outer loop rather than as a `continue` in an inner loop.
-            if task.benchmark == "hendrycksTest" and any(
-                data["versions"].get(mmlu_k) == 0
-                for mmlu_k in ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")
-            ):
-                continue
-            # Result keys embed the benchmark name (e.g. harness|truthfulqa:mc|0), hence the substring match.
-            raw_scores = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
-            if not raw_scores or any(score is None for score in raw_scores):
-                continue
-            # Some benchmark values are NaNs, mostly truthfulQA: report 0.0 instead of a NaN mean.
-            if any(math.isnan(float(score)) for score in raw_scores):
-                results[task.benchmark] = 0.0
-                continue
-            # We average all scores of a given metric (mostly for mmlu)
-            results[task.benchmark] = np.mean(np.array(raw_scores)) * 100.0
-        return results
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it.
-        Any failure (no request file, unreadable or malformed file) is logged and
-        leaves ``status`` set to "FAILED" instead of raising.
-        """
-        # Bound before the try block so the FileNotFoundError handler can always format it.
-        request_file = None
-        try:
-            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-            if request_file is None:
-                logging.warning(f"No request file for {self.org}/{self.model}")
-                self.status = "FAILED"
-                return
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = int(request.get("params", 0))  # Ensuring type safety
-            self.date = request.get("submitted_time", "")
-            self.architecture = request.get("architectures", "Unknown")
-            self.status = request.get("status", "FAILED")
-        except FileNotFoundError:
-            self.status = "FAILED"
-            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
-        except JSONDecodeError:
-            self.status = "FAILED"
-            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
-        except KeyError as e:
-            self.status = "FAILED"
-            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
-        except Exception as e:  # Catch-all for any other unexpected exceptions
-            self.status = "FAILED"
-            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
-    def update_with_dynamic_file_dict(self, file_dict):
-        """Update license, likes, hub availability, tags and the flag status from ``file_dict``, using defaults for missing keys."""
-        self.license = file_dict.get("license", "?")
-        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
-        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
-        self.tags = file_dict.get("tags", [])
-        # A model counts as flagged as soon as any tag contains the substring "flagged".
-        self.not_flagged = not any("flagged" in tag for tag in self.tags)
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display.
-        Raises:
-            KeyError: if a benchmark listed in ``Tasks`` has no entry in ``self.results``;
-                ``get_raw_eval_results`` relies on this to drop incomplete evaluations.
-        """
-        # The sum is divided by the number of tasks in ``Tasks``, not by the number of available scores.
-        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.fullname.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            # The next two values are stored inverted: True means the tag/name marker is absent.
-            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
-            AutoEvalColumn.moe.name: not (
-                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
-            ),
-            AutoEvalColumn.not_flagged.name: self.not_flagged,
-        }
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        return data_dict
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED.
-    Candidates are the files matching ``{model_name}_eval_request_*.json`` under
-    ``requests_path``, visited in descending name order. A candidate qualifies when
-    its ``status`` is "FINISHED" and its ``precision`` equals the last dotted
-    component of ``precision``. When several qualify, the last one visited wins.
-    Returns:
-        The qualifying file path as a ``str``, or ``None`` when no file qualifies.
-    Raises:
-        KeyError: if a candidate file lacks the ``status`` or ``precision`` key.
-    """
-    # Sorted in descending order by name.
-    candidates = sorted(Path(requests_path).glob(f"{model_name}_eval_request_*.json"), reverse=True)
-    wanted_precision = precision.split(".")[-1]
-    # Kept separate from the loop variable: sharing one name made the function
-    # return the last visited file even when nothing matched.
-    selected = None
-    for candidate in candidates:
-        with candidate.open("r") as f:
-            req_content = json.load(f)
-        if req_content["status"] == "FINISHED" and req_content["precision"] == wanted_precision:
-            selected = str(candidate)
-    return selected
-def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
-    """Collect every finished, complete evaluation found under the results folder root.
-    Result files are processed in the order of the timestamp in their name; files
-    that share an ``eval_name`` are merged into a single EvalResult. Evaluations
-    that are not FINISHED, or that lack a benchmark score, are left out.
-    """
-    with open(dynamic_path) as handle:
-        dynamic_data = json.load(handle)
-    result_files = sorted(
-        Path(results_path).rglob("results_*.json"),
-        key=lambda file: parse_datetime(file.stem.removeprefix("results_")),
-    )
-    # Hardcoding because of gating problem
-    gated_orgs = ("meta-llama/", "google/", "tiiuae/")
-    merged = {}
-    # tqdm gives a progress display over the result files
-    for filepath in tqdm(result_files, desc="Processing model files"):
-        current = EvalResult.init_from_json_file(filepath)
-        with logging_redirect_tqdm():
-            current.update_with_request_file(requests_path)
-        if current.full_model in dynamic_data:
-            current.update_with_dynamic_file_dict(dynamic_data[current.full_model])
-            if any(org in current.full_model for org in gated_orgs):
-                current.still_on_hub = True
-        # Store results of same eval together
-        previous = merged.get(current.eval_name)
-        if previous is None:
-            merged[current.eval_name] = current
-        else:
-            previous.results.update({k: v for k, v in current.results.items() if v is not None})
-    finished = []
-    for name, result in merged.items():
-        if result.status != "FINISHED":
-            continue
-        try:
-            result.to_dict()  # we test if the dict version is complete
-        except KeyError as e:
-            logging.error(f"Error while checking model {name} {result.date} json, no key: {e}")  # not all eval values present
-            continue
-        finished.append(result)
-    return finished

src/populate.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.display.utils import load_json_data
@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
     return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
-def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
     """Retrieve and process leaderboard data."""
-    raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)
-    all_data_json = [model.to_dict() for model in raw_data] + [baseline_row]
-    filter_models_flags(all_data_json)
-    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df

 import pathlib
 import pandas as pd
+from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
 from src.display.utils import load_json_data
     return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
+def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
     """Retrieve and process leaderboard data."""
+    all_data_json = leaderboard_dataset.to_dict()
+    num_items = leaderboard_dataset.num_rows
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+    filter_models_flags(all_data_json_list)
+    df = pd.DataFrame.from_records(all_data_json_list)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
+    return df

src/scripts/update_all_request_files.py DELETED Viewed

@@ -1,129 +0,0 @@
-import json
-import os
-import time
-from huggingface_hub import snapshot_download
-from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
-from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
-def update_one_model(model_id, data, models_on_the_hub):
-    # Model no longer on the hub at all
-    if model_id not in models_on_the_hub:
-        data["still_on_hub"] = False
-        data["likes"] = 0
-        data["downloads"] = 0
-        data["created_at"] = ""
-        data["tags"] = []
-        return data
-    # Grabbing model parameters
-    model_cfg = models_on_the_hub[model_id]
-    data["likes"] = model_cfg.likes
-    data["downloads"] = model_cfg.downloads
-    data["created_at"] = str(model_cfg.created_at)
-    data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
-    # Grabbing model details
-    model_name = model_id
-    if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
-        if isinstance(model_cfg.card_data.base_model, str):
-            model_name = model_cfg.card_data.base_model  # for adapters, we look at the parent model
-    still_on_hub, _, _ = is_model_on_hub(
-        model_name=model_name,
-        revision=data.get("revision"),
-        trust_remote_code=True,
-        test_tokenizer=False,
-        token=H4_TOKEN,
-    )
-    # If the model doesn't have a model card or a license, we consider it's deleted
-    if still_on_hub:
-        try:
-            status, _, model_card = check_model_card(model_id)
-            if status is False:
-                still_on_hub = False
-        except Exception:
-            model_card = None
-            still_on_hub = False
-    data["still_on_hub"] = still_on_hub
-    tags = get_model_tags(model_card, model_id) if still_on_hub else []
-    data["tags"] = tags
-    return data
-def update_models(file_path, models_on_the_hub):
-    """
-    Search through all JSON files in the specified root folder and its subfolders,
-    and update the likes key in JSON dict from value of input dict
-    """
-    seen_models = []
-    with open(file_path, "r") as f:
-        model_infos = json.load(f)
-        for model_id in model_infos.keys():
-            seen_models.append(model_id)
-            model_infos[model_id] = update_one_model(
-                model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
-            )
-    # If new requests files have been created since we started all this
-    # we grab them
-    all_models = []
-    try:
-        for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
-            if ix == 0:
-                continue
-            for file in files:
-                if "eval_request" in file:
-                    path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
-                    all_models.append(path)
-    except Exception as e:
-        print(e)
-        pass
-    for model_id in all_models:
-        if model_id not in seen_models:
-            model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
-    with open(file_path, "w") as f:
-        json.dump(model_infos, f, indent=2)
-def update_dynamic_files():
-    """This will only update metadata for models already linked in the repo, not add missing ones."""
-    snapshot_download(
-        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-    print("UPDATE_DYNAMIC: Loaded snapshot")
-    # Get models
-    start = time.time()
-    models = list(
-        API.list_models(
-            # filter=ModelFilter(task="text-generation"),
-            full=False,
-            cardData=True,
-            fetch_config=True,
-        )
-    )
-    id_to_model = {model.id: model for model in models}
-    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
-    start = time.time()
-    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
-    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
-    API.upload_file(
-        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
-        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
-        repo_id=DYNAMIC_INFO_REPO,
-        repo_type="dataset",
-        commit_message="Daily request file update.",
-    )
-    print("UPDATE_DYNAMIC: pushed to hub")

src/submission/check_validity.py CHANGED Viewed

@@ -13,7 +13,7 @@ from src.envs import HAS_HIGHER_RATE_LIMIT
 # ht to @Wauplin, thank you for the snippet!
-# See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     # Returns operation status, and error message
     try:

 # ht to @Wauplin, thank you for the snippet!
+# See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     # Returns operation status, and error message
     try:

src/submission/submit.py CHANGED Viewed

@@ -2,16 +2,11 @@ import json
 import os
 from datetime import datetime, timezone
-from huggingface_hub import snapshot_download
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
-    H4_TOKEN,
     QUEUE_REPO,
     RATE_LIMIT_PERIOD,
     RATE_LIMIT_QUOTA,
@@ -35,7 +30,6 @@ def add_new_eval(
     base_model: str,
     revision: str,
     precision: str,
-    private: bool,
     weight_type: str,
     model_type: str,
 ):
@@ -80,7 +74,7 @@ def add_new_eval(
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(
-            model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
         )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
@@ -126,7 +120,6 @@ def add_new_eval(
         "model": model,
         "base_model": base_model,
         "revision": model_info.sha, # force to use the exact model commit
-        "private": private,
         "precision": precision,
         "params": model_size,
         "architectures": architecture,
@@ -154,7 +147,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -168,26 +161,6 @@ def add_new_eval(
         commit_message=f"Add {model} to eval queue",
     )
-    # We want to grab the latest version of the submission file to not accidentally overwrite it
-    snapshot_download(
-        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-    with open(DYNAMIC_INFO_FILE_PATH) as f:
-        all_supplementary_info = json.load(f)
-    all_supplementary_info[model] = supplementary_info
-    with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
-        json.dump(all_supplementary_info, f, indent=2)
-    API.upload_file(
-        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
-        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
-        repo_id=DYNAMIC_INFO_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to dynamic info queue",
-    )
     # Remove the local file
     os.remove(out_path)

 import os
 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import (
     API,
     EVAL_REQUESTS_PATH,
+    HF_TOKEN,
     QUEUE_REPO,
     RATE_LIMIT_PERIOD,
     RATE_LIMIT_QUOTA,
     base_model: str,
     revision: str,
     precision: str,
     weight_type: str,
     model_type: str,
 ):
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=HF_TOKEN, test_tokenizer=True
         )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
         "model": model,
         "base_model": base_model,
         "revision": model_info.sha, # force to use the exact model commit
         "precision": precision,
         "params": model_size,
         "architectures": architecture,
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
         commit_message=f"Add {model} to eval queue",
     )
     # Remove the local file
     os.remove(out_path)

src/tools/collections.py DELETED Viewed

@@ -1,76 +0,0 @@
-import pandas as pd
-from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
-from huggingface_hub.utils._errors import HfHubHTTPError
-from pandas import DataFrame
-from src.display.utils import AutoEvalColumn, ModelType
-from src.envs import H4_TOKEN, PATH_TO_COLLECTION
-# Specific intervals for the collections
-intervals = {
-    "1B": pd.Interval(0, 1.5, closed="right"),
-    "3B": pd.Interval(2.5, 3.5, closed="neither"),
-    "7B": pd.Interval(6, 8, closed="neither"),
-    "13B": pd.Interval(10, 14, closed="neither"),
-    "30B": pd.Interval(25, 35, closed="neither"),
-    "65B": pd.Interval(60, 70, closed="neither"),
-}
-def _filter_by_type_and_size(df, model_type, size_interval):
-    """Filter DataFrame by model type and parameter size interval."""
-    type_emoji = model_type.value.symbol[0]
-    filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: x in size_interval)
-    return filtered_df.loc[mask]
-def _add_models_to_collection(collection, models, model_type, size):
-    """Add best models to the collection and update positions."""
-    cur_len_collection = len(collection.items)
-    for ix, model in enumerate(models, start=1):
-        try:
-            collection = add_collection_item(
-                PATH_TO_COLLECTION,
-                item_id=model,
-                item_type="model",
-                exists_ok=True,
-                note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
-                token=H4_TOKEN,
-            )
-            # Ensure position is correct if item was added
-            if len(collection.items) > cur_len_collection:
-                item_object_id = collection.items[-1].item_object_id
-                update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
-                cur_len_collection = len(collection.items)
-            break  # assuming we only add the top model
-        except HfHubHTTPError:
-            continue
-def update_collections(df: DataFrame):
-    """Update collections by filtering and adding the best models."""
-    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
-    cur_best_models = []
-    for model_type in ModelType:
-        if not model_type.value.name:
-            continue
-        for size, interval in intervals.items():
-            filtered_df = _filter_by_type_and_size(df, model_type, interval)
-            best_models = list(
-                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
-            )
-            print(model_type.value.symbol, size, best_models)
-            _add_models_to_collection(collection, best_models, model_type, size)
-            cur_best_models.extend(best_models)
-    # Cleanup
-    existing_models = {item.item_id for item in collection.items}
-    to_remove = existing_models - set(cur_best_models)
-    for item_id in to_remove:
-        try:
-            delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
-        except HfHubHTTPError:
-            continue

src/{scripts → tools}/create_request_file.py RENAMED Viewed

File without changes

src/tools/model_backlinks.py CHANGED Viewed

@@ -630,7 +630,7 @@ models = [
     "WizardLM/WizardMath-7B-V1.0",
     "Norquinal/llama-2-7b-claude-chat",
     "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
-    "HuggingFaceH4/starchat-beta",
     "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
     "conceptofmind/LLongMA-2-13b-16k",
     "tianyil1/denas-llama2",
@@ -1039,7 +1039,7 @@ models = [
     "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
     "EleutherAI/gpt-neo-2.7B",
     "danielhanchen/open_llama_3b_600bt_preview",
-    "HuggingFaceH4/starchat-alpha",
     "pythainlp/wangchanglm-7.5B-sft-en-sharded",
     "beaugogh/pythia-1.4b-deduped-sharegpt",
     "HWERI/pythia-1.4b-deduped-sharegpt",

     "WizardLM/WizardMath-7B-V1.0",
     "Norquinal/llama-2-7b-claude-chat",
     "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
+    "open-llm-leaderboard/starchat-beta",
     "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
     "conceptofmind/LLongMA-2-13b-16k",
     "tianyil1/denas-llama2",
     "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
     "EleutherAI/gpt-neo-2.7B",
     "danielhanchen/open_llama_3b_600bt_preview",
+    "open-llm-leaderboard/starchat-alpha",
     "pythainlp/wangchanglm-7.5B-sft-en-sharded",
     "beaugogh/pythia-1.4b-deduped-sharegpt",
     "HWERI/pythia-1.4b-deduped-sharegpt",

src/tools/plots.py CHANGED Viewed

@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
 from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
-from src.leaderboard.read_evals import EvalResult
-def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     """
     Generates a DataFrame containing the maximum scores until each date.
@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
     # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-    results_df = pd.DataFrame(raw_data)
-    # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
     # Step 2: Initialize the scores dictionary
@@ -30,22 +28,18 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
-            current_model = row["full_model"]
             # We ignore models that are flagged/no longer on the hub/not finished
             to_ignore = (
-                not row["still_on_hub"]
-                or not row["not_flagged"]
                 or current_model in FLAGGED_MODELS
-                or row["status"] != "FINISHED"
             )
             if to_ignore:
                 continue
-            current_date = row["date"]
-            if task.benchmark == "Average":
-                current_score = np.mean(list(row["results"].values()))
-            else:
-                current_score = row["results"][task.benchmark]
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0:

 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
 from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
+def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
     """
     Generates a DataFrame containing the maximum scores until each date.
     :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
     # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
     # Step 2: Initialize the scores dictionary
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
+            current_model = row[AutoEvalColumn.fullname.name]
             # We ignore models that are flagged or no longer on the hub
             to_ignore = (
+                not row[AutoEvalColumn.still_on_hub.name]
+                or not row[AutoEvalColumn.not_flagged.name]
                 or current_model in FLAGGED_MODELS
             )
             if to_ignore:
                 continue
+            current_date = row[AutoEvalColumn.date.name]
+            current_score = row[task.col_name]
             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0: