felix committed
Commit 97453a2
2 Parent(s): 9c1a0d4 a6f1b1f

sync with upstream
.gitignore CHANGED
@@ -1,4 +1,3 @@
- auto_evals/
  venv/
  __pycache__/
  .env
@@ -6,10 +5,8 @@ __pycache__/
  *ipynb
  .vscode/

- gpt_4_evals/
- human_evals/
  eval-queue/
  eval-results/
- auto_evals/
+ dynamic-info/

  src/assets/model_counts.html
README.md CHANGED
@@ -4,11 +4,17 @@ emoji: 🏆
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.8.0
+ sdk_version: 4.9.0
  app_file: app.py
  pinned: true
  license: apache-2.0
  duplicated_from: HuggingFaceH4/open_llm_leaderboard
+ fullWidth: true
+ space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
+   private: true
+   secrets:
+     - HF_TOKEN
+     - H4_TOKEN
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -4,7 +4,6 @@ import os
  from datetime import datetime, timezone

  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.display.about import (
@@ -30,7 +29,7 @@ from src.display.utils import (
      WeightType,
      Precision
  )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+ from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval
  from src.tools.collections import update_collections
@@ -44,33 +43,52 @@ from src.tools.plots import (
  def restart_space():
      API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-     )
- except Exception:
-     restart_space()
-
-
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
- update_collections(original_df.copy())
- leaderboard_df = original_df.copy()
-
- plot_df = create_plot_df(create_scores_df(raw_data))
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+ def init_space():
+     try:
+         print(EVAL_REQUESTS_PATH)
+         snapshot_download(
+             repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+         )
+     except Exception:
+         restart_space()
+     try:
+         print(DYNAMIC_INFO_PATH)
+         snapshot_download(
+             repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+         )
+     except Exception:
+         restart_space()
+     try:
+         print(EVAL_RESULTS_PATH)
+         snapshot_download(
+             repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+         )
+     except Exception:
+         restart_space()
+
+
+     raw_data, original_df = get_leaderboard_df(
+         results_path=EVAL_RESULTS_PATH,
+         requests_path=EVAL_REQUESTS_PATH,
+         dynamic_path=DYNAMIC_INFO_FILE_PATH,
+         cols=COLS,
+         benchmark_cols=BENCHMARK_COLS
+     )
+     update_collections(original_df.copy())
+     leaderboard_df = original_df.copy()
+
+     plot_df = create_plot_df(create_scores_df(raw_data))
+
+     (
+         finished_eval_queue_df,
+         running_eval_queue_df,
+         pending_eval_queue_df,
+     ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+     return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+
+ leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()


  # Searching and filtering
@@ -81,10 +99,12 @@ def update_table(
      precision_query: str,
      size_query: list,
      show_deleted: bool,
+     show_merges: bool,
+     show_moe: bool,
      show_flagged: bool,
      query: str,
  ):
-     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
+     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_merges, show_moe, show_flagged)
      filtered_df = filter_queries(query, filtered_df)
      df = select_columns(filtered_df, columns)
      return df
@@ -100,13 +120,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:


  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-     always_here_cols = [
-         AutoEvalColumn.model_type_symbol.name,
-         AutoEvalColumn.model.name,
-     ]
+     always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+     dummy_col = [AutoEvalColumn.dummy.name]
+     #AutoEvalColumn.model_type_symbol.name,
+     #AutoEvalColumn.model.name,
      # We use COLS to maintain sorting
      filtered_df = df[
-         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
      ]
      return filtered_df

@@ -132,7 +152,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):


  def filter_models(
-     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
+     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_merges: bool, show_moe:bool, show_flagged: bool
  ) -> pd.DataFrame:
      # Show all models
      if show_deleted:
@@ -140,6 +160,12 @@ def filter_models(
      else: # Show only still on the hub models
          filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]

+     if not show_merges:
+         filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
+
+     if not show_moe:
+         filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
+
      if not show_flagged:
          filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]

@@ -154,7 +180,16 @@ def filter_models(

      return filtered_df

- leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, True)
+ leaderboard_df = filter_models(
+     df=leaderboard_df,
+     type_query=[t.to_str(" : ") for t in ModelType],
+     size_query=list(NUMERIC_INTERVALS.keys()),
+     precision_query=[i.value.name for i in Precision],
+     show_deleted=False,
+     show_merges=False,
+     show_moe=True,
+     show_flagged=False
+ )

  import unicodedata

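For readers skimming the diff: the new `show_merges` / `show_moe` checkboxes filter on the display columns defined in `src/display/utils.py`. Below is a minimal standalone sketch (not part of the commit, toy data only) of the boolean logic `filter_models` now applies with the defaults used above (`show_deleted=False, show_merges=False, show_moe=True, show_flagged=False`):

```python
import pandas as pd

# Toy rows using the display column names from src/display/utils.py;
# the model names are placeholders.
df = pd.DataFrame(
    {
        "model_name_for_query": ["org/base-7b", "org/slerp-merge-7b", "org/mixtral-like-moe"],
        "Available on the hub": [True, True, True],
        "Merged": [False, True, False],
        "MoE": [False, False, True],
        "Flagged": [False, True, False],
    }
)

# Same boolean logic as the new filter_models defaults in app.py.
filtered = df[df["Available on the hub"] == True]      # show_deleted=False keeps only hub models
filtered = filtered[filtered["Merged"] == False]        # show_merges=False drops merges
# show_moe=True keeps MoE rows, so no "MoE" filter is applied here
filtered = filtered[filtered["Flagged"] == False]       # show_flagged=False drops flagged rows
print(filtered["model_name_for_query"].tolist())        # -> ['org/base-7b', 'org/mixtral-like-moe']
```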
src/display/about.py CHANGED
@@ -159,10 +159,13 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
  ### 4) Fill up your model card
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card

+ ### 5) Select the correct precision
+ Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
+
  ## In case of model failure
  If your model is displayed in the `FAILED` category, its execution stopped.
  Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+ If everything is done, check you can launch the EleutherAIHarness on your model locally, using the command in the About tab under "Reproducibility" with all arguments specified (you can add `--limit` to limit the number of examples per task).
  """

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
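The new precision note is easy to reproduce outside the leaderboard. A minimal illustration (not from this commit) of why the weight range matters when a `bf16` checkpoint is read back as `fp16`:

```python
import torch

# bfloat16 shares float32's exponent range, float16 does not: large bf16
# weights overflow when naively cast to fp16, which is one way NaNs appear.
w_bf16 = torch.tensor([3.0e38, -2.5e38], dtype=torch.bfloat16)
w_fp16 = w_bf16.to(torch.float16)   # -> [inf, -inf]
print(w_fp16)
print(w_fp16 - w_fp16)              # inf - inf -> nan
```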
src/display/css_html_js.py CHANGED
@@ -1,5 +1,24 @@
  custom_css = """
+ /* Hides the final AutoEvalColumn */
+ #llm-benchmark-tab-table table td:last-child,
+ #llm-benchmark-tab-table table th:last-child {
+     display: none;
+ }

+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ table td:first-child,
+ table th:first-child {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+
+ /* Full width space */
+ .gradio-container {
+     max-width: 95%!important;
+ }
+
+ /* Text style and margins */
  .markdown-text {
      font-size: 16px !important;
  }
@@ -21,14 +40,6 @@ custom_css = """
      transform: scale(1.3);
  }

- #leaderboard-table {
-     margin-top: 15px
- }
-
- #leaderboard-table-lite {
-     margin-top: 15px
- }
-
  #search-bar-table-box > div:first-child {
      background: none;
      border: none;
@@ -38,36 +49,11 @@ custom_css = """
      padding: 0px;
  }

- /* Hides the final AutoEvalColumn */
- #llm-benchmark-tab-table table td:last-child,
- #llm-benchmark-tab-table table th:last-child {
-     display: none;
- }
-
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
- table td:first-child,
- table th:first-child {
-     max-width: 400px;
-     overflow: auto;
-     white-space: nowrap;
- }
-
  .tab-buttons button {
      font-size: 20px;
  }

- #scale-logo {
-     border-style: none !important;
-     box-shadow: none;
-     display: block;
-     margin-left: auto;
-     margin-right: auto;
-     max-width: 600px;
- }
-
- #scale-logo .download {
-     display: none;
- }
+ /* Filters style */
  #filter_type{
      border: 0;
      padding-left: 0;
src/display/utils.py CHANGED
@@ -38,7 +38,7 @@ auto_eval_column_dict = []
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
  #Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
  for task in Tasks:
      auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
  # Model information
@@ -46,13 +46,14 @@ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type",
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
  auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
- auto_eval_column_dict.append(["merge", ColumnContent, ColumnContent("Merged", "bool", False)])
+ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
  auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
  auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
- auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
+ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
+ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
  # Dummy column for the search bar (hidden by the custom CSS)
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

@@ -73,8 +74,8 @@ baseline_row = {
      AutoEvalColumn.model.name: "<p>Baseline</p>",
      AutoEvalColumn.revision.name: "N/A",
      AutoEvalColumn.precision.name: None,
+     AutoEvalColumn.merged.name: False,
      AutoEvalColumn.average.name: 31.0,
-     AutoEvalColumn.merge.name: False,
      AutoEvalColumn.arc.name: 25.0,
      AutoEvalColumn.hellaswag.name: 25.0,
      AutoEvalColumn.mmlu.name: 25.0,
@@ -98,8 +99,8 @@ human_baseline_row = {
      AutoEvalColumn.model.name: "<p>Human performance</p>",
      AutoEvalColumn.revision.name: "N/A",
      AutoEvalColumn.precision.name: None,
-     AutoEvalColumn.merge.name: False,
      AutoEvalColumn.average.name: 92.75,
+     AutoEvalColumn.merged.name: False,
      AutoEvalColumn.arc.name: 80.0,
      AutoEvalColumn.hellaswag.name: 95.0,
      AutoEvalColumn.mmlu.name: 89.8,
@@ -108,6 +109,7 @@ human_baseline_row = {
      AutoEvalColumn.gsm8k.name: 100,
      AutoEvalColumn.dummy.name: "human_baseline",
      AutoEvalColumn.model_type.name: "",
+     AutoEvalColumn.flagged.name: False,
  }

  @dataclass
@@ -168,10 +170,8 @@ class Precision(Enum):


  # Column selection
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+ COLS = [c.name for c in fields(AutoEvalColumn)]
+ TYPES = [c.type for c in fields(AutoEvalColumn)]

  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
src/envs.py CHANGED
@@ -7,6 +7,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)

  REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
  QUEUE_REPO = "open-llm-leaderboard/requests"
+ DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
  RESULTS_REPO = "open-llm-leaderboard/results"

  PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
@@ -18,6 +19,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")

  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ DYNAMIC_INFO_PATH = os.path.join(CACHE_PATH, "dynamic-info")
+ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")

  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
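For orientation only: `DYNAMIC_INFO_FILE_PATH` points at a `model_infos.json` keyed by model id. Judging from the fields read in `src/leaderboard/read_evals.py` and written in `src/scripts/update_all_request_files.py` further down, an entry plausibly looks like the sketch below; the exact schema is an assumption and the repo name and values are placeholders.

```python
# Hypothetical entry in model_infos.json, inferred from update_with_dynamic_file_dict()
# and update_models(); the real file may contain additional keys.
example_dynamic_info = {
    "some-org/some-model-7b": {
        "license": "apache-2.0",
        "likes": 123,
        "downloads": 4567,
        "created_at": "2023-12-01 00:00:00+00:00",
        "still_on_hub": True,
        "tags": ["merge", "flagged:undisclosed_merge"],  # may also contain "moe", etc.
    }
}
```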
src/leaderboard/filter_models.py CHANGED
@@ -4,6 +4,7 @@ from src.display.utils import AutoEvalColumn
  # Models which have been flagged by users as being problematic for a reason or another
  # (Model name to forum discussion link)
  FLAGGED_MODELS = {
+     "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
      "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
      "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
      "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
@@ -38,7 +39,49 @@ FLAGGED_MODELS = {
      "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
      "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
      "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-
+     "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
+     "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+     "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
+     "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
+     # Merges not indicated
+     "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+     "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+     "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+     "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+     "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
+     "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
  }

  # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -52,10 +95,16 @@ DO_NOT_SUBMIT_MODELS = [

  def flag_models(leaderboard_data: list[dict]):
      for model_data in leaderboard_data:
-         if model_data["model_name_for_query"] in FLAGGED_MODELS:
-             issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
+         # Merges and moes are flagged automatically
+         if model_data[AutoEvalColumn.flagged.name] == True:
+             flag_key = "merged"
+         else:
+             flag_key = model_data["model_name_for_query"]
+
+         if flag_key in FLAGGED_MODELS:
+             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
              issue_link = model_hyperlink(
-                 FLAGGED_MODELS[model_data["model_name_for_query"]],
+                 FLAGGED_MODELS[flag_key],
                  f"See discussion #{issue_num}",
              )
              model_data[
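A small, self-contained illustration (hypothetical rows, not part of the commit) of how the reworked lookup resolves automatically flagged merges to the shared discussion #510 while explicitly listed models keep their own link:

```python
# "Flagged" is the display name of AutoEvalColumn.flagged; the first model name is made up.
FLAGGED = {
    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
}

rows = [
    {"model_name_for_query": "some-org/quiet-merge-7b", "Flagged": True},   # auto-flagged merge
    {"model_name_for_query": "udkai/Garrulus", "Flagged": False},           # explicitly listed
]
for row in rows:
    flag_key = "merged" if row["Flagged"] else row["model_name_for_query"]
    if flag_key in FLAGGED:
        print(flag_key, "->", FLAGGED[flag_key])
```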
src/leaderboard/read_evals.py CHANGED
@@ -5,15 +5,12 @@ import os
  from dataclasses import dataclass

  import dateutil
- from datetime import datetime
- from transformers import AutoConfig
  import numpy as np

  from huggingface_hub import ModelCard

  from src.display.formatting import make_clickable_model
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
- from src.submission.check_validity import is_model_on_hub


  @dataclass
@@ -33,8 +30,11 @@ class EvalResult:
      likes: int = 0
      num_params: int = 0
      date: str = "" # submission date of request file
-     still_on_hub: bool = False
-     merge: bool = False
+     still_on_hub: bool = True
+     is_merge: bool = False
+     flagged: bool = False
+     status: str = "FINISHED"
+     tags: list = None

      @classmethod
      def init_from_json_file(self, json_filepath):
@@ -43,13 +43,13 @@ class EvalResult:
              data = json.load(fp)

          # We manage the legacy config format
-         config = data.get("config", data.get("config_general", None))
+         config = data.get("config_general")

          # Precision
          precision = Precision.from_str(config.get("model_dtype"))

          # Get model and org
-         org_and_model = config.get("model_name", config.get("model_args", None))
+         org_and_model = config.get("model_name")
          org_and_model = org_and_model.split("/", 1)

          if len(org_and_model) == 1:
@@ -62,20 +62,6 @@ class EvalResult:
          result_key = f"{org}_{model}_{precision.value.name}"
          full_model = "/".join(org_and_model)

-         try:
-             merge = any(t in ["merge", "mergedlm"] for t in ModelCard.load(full_model).data.tags)
-         except Exception:
-             merge = False
-
-         still_on_hub, error, model_config = is_model_on_hub(
-             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-         )
-         architecture = "?"
-         if model_config is not None:
-             architectures = getattr(model_config, "architectures", None)
-             if architectures:
-                 architecture = ";".join(architectures)
-
          # Extract results available in this file (some results are split in several files)
          results = {}
          for task in Tasks:
@@ -112,9 +98,6 @@ class EvalResult:
              results=results,
              precision=precision,
              revision= config.get("model_sha", ""),
-             still_on_hub=still_on_hub,
-             architecture=architecture,
-             merge=merge
          )

      def update_with_request_file(self, requests_path):
@@ -124,15 +107,24 @@ class EvalResult:
          try:
              with open(request_file, "r") as f:
                  request = json.load(f)
-             self.model_type = ModelType.from_str(request.get("model_type", ""))
+             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
              self.weight_type = WeightType[request.get("weight_type", "Original")]
-             self.license = request.get("license", "?")
-             self.likes = request.get("likes", 0)
              self.num_params = request.get("params", 0)
              self.date = request.get("submitted_time", "")
+             self.architecture = request.get("architectures", "Unknown")
+             self.status = request.get("status", "FAILED")
-         except Exception:
+         except Exception as e:
+             self.status = "FAILED"
              print(f"Could not find request file for {self.org}/{self.model}")

+     def update_with_dynamic_file_dict(self, file_dict):
+         self.license = file_dict.get("license", "?")
+         self.likes = file_dict.get("likes", 0)
+         self.still_on_hub = file_dict["still_on_hub"]
+         self.flagged = any("flagged" in tag for tag in file_dict["tags"])
+         self.tags = file_dict["tags"]
+
+
      def to_dict(self):
          """Converts the Eval Result to a dict compatible with our dataframe display"""
          average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -140,7 +132,6 @@ class EvalResult:
              "eval_name": self.eval_name, # not a column, just a save name,
              AutoEvalColumn.precision.name: self.precision.value.name,
              AutoEvalColumn.model_type.name: self.model_type.value.name,
-             AutoEvalColumn.merge.name: self.merge,
              AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
              AutoEvalColumn.weight_type.name: self.weight_type.value.name,
              AutoEvalColumn.architecture.name: self.architecture,
@@ -152,6 +143,9 @@ class EvalResult:
              AutoEvalColumn.likes.name: self.likes,
              AutoEvalColumn.params.name: self.num_params,
              AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
+             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
+             AutoEvalColumn.flagged.name: self.flagged
          }

          for task in Tasks:
@@ -182,7 +176,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
      return request_file


- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
      """From the path of the results folder root, extract all needed info for results"""
      model_result_filepaths = []

@@ -200,11 +194,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
          for file in files:
              model_result_filepaths.append(os.path.join(root, file))

+     with open(dynamic_path) as f:
+         dynamic_data = json.load(f)
+
      eval_results = {}
      for model_result_filepath in model_result_filepaths:
          # Creation of result
          eval_result = EvalResult.init_from_json_file(model_result_filepath)
          eval_result.update_with_request_file(requests_path)
+         if eval_result.full_model in dynamic_data:
+             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])

          # Store results of same eval together
          eval_name = eval_result.eval_name
@@ -216,8 +215,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
      results = []
      for v in eval_results.values():
          try:
-             v.to_dict() # we test if the dict version is complete
-             results.append(v)
+             if v.status == "FINISHED":
+                 v.to_dict() # we test if the dict version is complete
+                 results.append(v)
          except KeyError: # not all eval values present
              continue

src/populate.py CHANGED
@@ -9,8 +9,8 @@ from src.leaderboard.filter_models import filter_models
  from src.leaderboard.read_evals import get_raw_eval_results


- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-     raw_data = get_raw_eval_results(results_path, requests_path)
+ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
      all_data_json = [v.to_dict() for v in raw_data]
      all_data_json.append(baseline_row)
      filter_models(all_data_json)
{scripts → src/scripts}/create_request_file.py RENAMED
@@ -1,36 +1,21 @@
  import json
  import os
  import pprint
- import re
  from datetime import datetime, timezone

  import click
  from colorama import Fore
  from huggingface_hub import HfApi, snapshot_download

+ from src.submission.check_validity import get_model_size
+ from src.display.utils import ModelType, WeightType
+
  EVAL_REQUESTS_PATH = "eval-queue"
  QUEUE_REPO = "open-llm-leaderboard/requests"

  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
- model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
- weight_types = ("Original", "Delta", "Adapter")
-
-
- def get_model_size(model_info, precision: str):
-     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-     try:
-         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-     except (AttributeError, TypeError):
-         try:
-             size_match = re.search(size_pattern, model_info.modelId.lower())
-             model_size = size_match.group(0)
-             model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-         except AttributeError:
-             return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-     size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-     model_size = size_factor * model_size
-     return model_size
+ model_types = [e.name for e in ModelType]
+ weight_types = [e.name for e in WeightType]


  def main():
src/scripts/update_all_request_files.py ADDED
@@ -0,0 +1,109 @@
+ from huggingface_hub import ModelFilter, snapshot_download
+ from huggingface_hub import ModelCard
+
+ import json
+ import time
+ from src.submission.check_validity import is_model_on_hub, check_model_card
+ from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
+
+ def update_models(file_path, models):
+     """
+     Search through all JSON files in the specified root folder and its subfolders,
+     and update the likes key in JSON dict from value of input dict
+     """
+     with open(file_path, "r") as f:
+         model_infos = json.load(f)
+     for model_id, data in model_infos.items():
+         if model_id not in models:
+             data['still_on_hub'] = False
+             data['likes'] = 0
+             data['downloads'] = 0
+             data['created_at'] = ""
+             continue
+
+         model_cfg = models[model_id]
+         data['likes'] = model_cfg.likes
+         data['downloads'] = model_cfg.downloads
+         data['created_at'] = str(model_cfg.created_at)
+         #data['params'] = get_model_size(model_cfg, data['precision'])
+         data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
+
+         # Is the model still on the hub
+         still_on_hub, error, model_config = is_model_on_hub(
+             model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
+         )
+         # If the model doesn't have a model card or a license, we consider it's deleted
+         if still_on_hub:
+             try:
+                 if check_model_card(model_id)[0] is False:
+                     still_on_hub = False
+             except Exception:
+                 still_on_hub = False
+         data['still_on_hub'] = still_on_hub
+
+         # Check if the model is a merge
+         is_merge_from_metadata = False
+         is_moe_from_metadata = False
+         if still_on_hub:
+             model_card = ModelCard.load(model_id)
+
+             # Storing the model metadata
+             tags = []
+             if model_card.data.tags:
+                 is_merge_from_metadata = "merge" in model_card.data.tags
+                 is_moe_from_metadata = "moe" in model_card.data.tags
+             merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
+             # If the model is a merge but not saying it in the metadata, we flag it
+             is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
+             if is_merge_from_model_card or is_merge_from_metadata:
+                 tags.append("merge")
+                 if not is_merge_from_metadata:
+                     tags.append("flagged:undisclosed_merge")
+             moe_keywords = ["moe", "mixture of experts", "mixtral"]
+             is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
+             is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
+             if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+                 tags.append("moe")
+                 if not is_moe_from_metadata:
+                     tags.append("flagged:undisclosed_moe")
+
+             data["tags"] = tags
+
+     with open(file_path, 'w') as f:
+         json.dump(model_infos, f, indent=2)
+
+ def update_dynamic_files():
+     """ This will only update metadata for models already linked in the repo, not add missing ones.
+     """
+     snapshot_download(
+         repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+
+     print("UPDATE_DYNAMIC: Loaded snapshot")
+     # Get models
+     start = time.time()
+
+     models = list(API.list_models(
+         filter=ModelFilter(task="text-generation"),
+         full=False,
+         cardData=True,
+         fetch_config=True,
+     ))
+     id_to_model = {model.id : model for model in models}
+
+     print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
+
+     start = time.time()
+
+     update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
+
+     print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
+
+     API.upload_file(
+         path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
+         path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
+         repo_id=DYNAMIC_INFO_REPO,
+         repo_type="dataset",
+         commit_message=f"Daily request file update.",
+     )
+     print(f"UPDATE_DYNAMIC: pushed to hub")
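This commit does not show how `update_dynamic_files` gets triggered (the `BackgroundScheduler` import was dropped from `app.py` above), so the following is only a hypothetical way to run it on a daily cadence, assuming APScheduler is installed; it is not necessarily what the Space does:

```python
# Hypothetical wiring, not part of this commit.
from apscheduler.schedulers.background import BackgroundScheduler

from src.scripts.update_all_request_files import update_dynamic_files

scheduler = BackgroundScheduler()
# Roughly matches the "Daily request file update." commit message used above.
scheduler.add_job(update_dynamic_files, "interval", hours=24)
scheduler.start()
```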
src/submission/check_validity.py CHANGED
@@ -6,9 +6,8 @@ from datetime import datetime, timedelta, timezone

  import huggingface_hub
  from huggingface_hub import ModelCard
- from huggingface_hub.hf_api import ModelInfo
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
  from transformers import AutoConfig, AutoTokenizer
- from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config

  from src.envs import HAS_HIGHER_RATE_LIMIT

@@ -37,9 +36,9 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
      return True, ""


- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
      try:
-         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
          if test_tokenizer:
              try:
                  tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
@@ -53,7 +52,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
                  return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
          return True, None, config

-     except ValueError:
+     except ValueError as e:
          return (
              False,
              "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
@@ -65,18 +64,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem


  def get_model_size(model_info: ModelInfo, precision: str):
-     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+     size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+     safetensors = None
      try:
-         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-     except (AttributeError, TypeError ):
+         safetensors = get_safetensors_metadata(model_info.id)
+     except Exception as e:
+         print(e)
+
+     if safetensors is not None:
+         model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
+     else:
          try:
-             size_match = re.search(size_pattern, model_info.modelId.lower())
+             size_match = re.search(size_pattern, model_info.id.lower())
              model_size = size_match.group(0)
              model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-         except AttributeError:
+         except AttributeError as e:
              return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

-     size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+     size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
      model_size = size_factor * model_size
      return model_size

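A rough usage sketch (not part of the commit) of the reworked `get_model_size`, which now prefers safetensors parameter counts and falls back to parsing a size hint such as `7b` out of the repo name. The repo id is only an example and the import path assumes you run from the Space root:

```python
from huggingface_hub import HfApi

from src.submission.check_validity import get_model_size  # path as laid out in this repo

api = HfApi()
info = api.model_info("mistralai/Mistral-7B-v0.1")  # any public repo id works here
# Uses safetensors metadata when the Hub reports it, otherwise the "7b" in the name.
print(get_model_size(info, precision="float16"))
```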
src/submission/submit.py CHANGED
@@ -2,8 +2,10 @@ import json
  import os
  from datetime import datetime, timezone

+ from huggingface_hub import ModelCard, snapshot_download
+
  from src.display.formatting import styled_error, styled_message, styled_warning
- from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+ from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
  from src.submission.check_validity import (
      already_submitted_models,
@@ -64,10 +66,21 @@ def add_new_eval(
          if not base_model_on_hub:
              return styled_error(f'Base model "{base_model}" {error}')

+     architecture = "?"
+     downloads = 0
+     created_at = ""
      if not weight_type == "Adapter":
-         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
+         model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
          if not model_on_hub:
              return styled_error(f'Model "{model}" {error}')
+         if model_config is not None:
+             architectures = getattr(model_config, "architectures", None)
+             if architectures:
+                 architecture = ";".join(architectures)
+             downloads = getattr(model_config, 'downloads', 0)
+             created_at = getattr(model_config, 'created_at', '')
+
+

      # Is the model info correctly filled?
      try:
@@ -86,6 +99,31 @@ def add_new_eval(
      modelcard_OK, error_msg = check_model_card(model)
      if not modelcard_OK:
          return styled_error(error_msg)
+
+     is_merge_from_metadata = False
+     is_moe_from_metadata = False
+     model_card = ModelCard.load(model)
+
+     # Storing the model tags
+     tags = []
+     if model_card.data.tags:
+         is_merge_from_metadata = "merge" in model_card.data.tags
+         is_moe_from_metadata = "moe" in model_card.data.tags
+     merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
+     # If the model is a merge but not saying it in the metadata, we flag it
+     is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
+     if is_merge_from_model_card or is_merge_from_metadata:
+         tags.append("merge")
+         if not is_merge_from_metadata:
+             tags.append("flagged:undisclosed_merge")
+     moe_keywords = ["moe", "mixture of experts", "mixtral"]
+     is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
+     is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
+     if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+         tags.append("moe")
+         if not is_moe_from_metadata:
+             tags.append("flagged:undisclosed_moe")
+

      # Seems good, creating the eval
      print("Adding new eval")
@@ -96,13 +134,23 @@ def add_new_eval(
          "revision": revision,
          "private": private,
          "precision": precision,
+         "params": model_size,
+         "architectures": architecture,
          "weight_type": weight_type,
          "status": "PENDING",
          "submitted_time": current_time,
          "model_type": model_type,
+         "job_id": -1,
+         "job_start_time": None,
+     }
+
+     supplementary_info = {
          "likes": model_info.likes,
-         "params": model_size,
          "license": license,
+         "still_on_hub": True,
+         "tags": tags,
+         "downloads": downloads,
+         "created_at": created_at
      }

      # Check for duplicate submission
@@ -126,6 +174,28 @@ def add_new_eval(
          commit_message=f"Add {model} to eval queue",
      )

+     # We want to grab the latest version of the submission file to not accidentally overwrite it
+     snapshot_download(
+         repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+
+     with open(DYNAMIC_INFO_FILE_PATH) as f:
+         all_supplementary_info = json.load(f)
+
+     all_supplementary_info[model] = supplementary_info
+     with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
+         json.dump(all_supplementary_info, f, indent=2)
+
+     API.upload_file(
+         path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
+         path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
+         repo_id=DYNAMIC_INFO_REPO,
+         repo_type="dataset",
+         commit_message=f"Add {model} to dynamic info queue",
+     )
+
+
+
      # Remove the local file
      os.remove(out_path)

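As a self-contained recap (not from the commit), the merge/MoE tagging added to both `add_new_eval` and `update_models` reduces to keyword checks over the model card text, its metadata tags, and the repo name. Applied to made-up inputs:

```python
# Made-up model card text and repo id, to show which tags the new logic would emit.
card_tags = []  # metadata tags on the hypothetical model card
card_text = "This model was built with mergekit by merging two 7B checkpoints."
model_id = "some-org/some-merged-7b"

tags = []
is_merge_from_metadata = "merge" in card_tags
is_moe_from_metadata = "moe" in card_tags
merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
if any(k in card_text.lower() for k in merge_keywords) or is_merge_from_metadata:
    tags.append("merge")
    if not is_merge_from_metadata:
        tags.append("flagged:undisclosed_merge")
moe_keywords = ["moe", "mixture of experts", "mixtral"]
is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
if any(k in card_text.lower() for k in moe_keywords) or is_moe_from_name or is_moe_from_metadata:
    tags.append("moe")
    if not is_moe_from_metadata:
        tags.append("flagged:undisclosed_moe")

print(tags)  # -> ['merge', 'flagged:undisclosed_merge']
```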