Spaces:

open-llm-leaderboard
/

open_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

1102

chriscanal committed on Oct 17, 2023

Commit

8e47868

1 Parent(s): 75297e7

Updated app.py to fix conflict and changed name of tab per Clémentine Fourrier's request

Browse files

Files changed (1) hide show

app.py +113 -65

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import json
 import os
 from datetime import datetime, timezone
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi
 from src.assets.css_html_js import custom_css, get_window_url_params
 from src.assets.text_content import (
@@ -24,6 +25,7 @@ from src.display_models.plot_results import (
     HUMAN_BASELINES,
 )
 from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
 from src.display_models.utils import (
     AutoEvalColumn,
     EvalQueueColumn,
@@ -32,7 +34,8 @@ from src.display_models.utils import (
     styled_message,
     styled_warning,
 )
-from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
 from src.rate_limiting import user_submission_permission
 pd.set_option("display.precision", 1)
@@ -60,6 +63,7 @@ api = HfApi(token=H4_TOKEN)
 def restart_space():
     api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
@@ -87,39 +91,23 @@ BENCHMARK_COLS = [
     ]
 ]
-## LOAD INFO FROM HUB
-eval_queue, requested_models, eval_results, users_to_submission_dates = load_all_info_from_hub(
-    QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
-)
-if not IS_PUBLIC:
-    (eval_queue_private, requested_models_private, eval_results_private, _) = load_all_info_from_hub(
-        PRIVATE_QUEUE_REPO,
-        PRIVATE_RESULTS_REPO,
-        EVAL_REQUESTS_PATH_PRIVATE,
-        EVAL_RESULTS_PATH_PRIVATE,
-    )
-else:
-    eval_queue_private, eval_results_private = None, None
-original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
-models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
 plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"
-# with open("models_backlinks.py", "w") as f:
-#     f.write(to_be_dumped)
-# print(to_be_dumped)
-leaderboard_df = original_df.copy()
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
-) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
-print(leaderboard_df["Precision"].unique())
 ## INTERACTION FUNCTIONS
@@ -135,18 +123,25 @@ def add_new_eval(
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
     num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
     if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
         error_msg = f"Organisation or user `{model.split('/')[0]}`"
         error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
         error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
-        error_msg += "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
         return styled_error(error_msg)
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-    # check the model actually exists before adding the eval
     if revision == "":
         revision = "main"
@@ -160,7 +155,34 @@ def add_new_eval(
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
-    print("adding new eval")
     eval_entry = {
         "model": model,
@@ -172,6 +194,9 @@ def add_new_eval(
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
     }
     user_name = ""
@@ -180,14 +205,11 @@ def add_new_eval(
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-    # Check if the model has been forbidden:
-    if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
-        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in requested_models:
         return styled_warning("This model has been already submitted.")
@@ -195,6 +217,7 @@ def add_new_eval(
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     api.upload_file(
         path_or_fileobj=out_path,
         path_in_repo=out_path.split("eval-queue/")[1],
@@ -203,7 +226,7 @@ def add_new_eval(
         commit_message=f"Add {model} to eval queue",
     )
-    # remove the local file
     os.remove(out_path)
     return styled_message(
@@ -223,17 +246,25 @@ def change_tab(query_param: str):
 # Searching and filtering
-def update_table(hidden_df: pd.DataFrame, current_columns_df: pd.DataFrame, columns: list, type_query: list, precision_query: str, size_query: list, show_deleted: bool, query: str):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-    if query != "":
-        filtered_df = search_table(filtered_df, query)
     df = select_columns(filtered_df, columns)
     return df
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
@@ -245,16 +276,39 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     ]
     return filtered_df
 NUMERIC_INTERVALS = {
-    "Unknown": pd.Interval(-1, 0, closed="right"),
-    "< 1.5B": pd.Interval(0, 1.5, closed="right"),
-    "~3B": pd.Interval(1.5, 5, closed="right"),
-    "~7B": pd.Interval(6, 11, closed="right"),
-    "~13B": pd.Interval(12, 15, closed="right"),
-    "~35B": pd.Interval(16, 55, closed="right"),
-    "60B+": pd.Interval(55, 10000, closed="right"),
 }
 def filter_models(
     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
@@ -266,7 +320,7 @@ def filter_models(
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query)]
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -287,7 +341,7 @@ with demo:
                 with gr.Column():
                     with gr.Row():
                         search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model and press ENTER...",
                             show_label=False,
                             elem_id="search-bar",
                         )
@@ -332,12 +386,14 @@ with demo:
                                 ModelType.FT.to_str(),
                                 ModelType.IFT.to_str(),
                                 ModelType.RL.to_str(),
                             ],
                             value=[
                                 ModelType.PT.to_str(),
                                 ModelType.FT.to_str(),
                                 ModelType.IFT.to_str(),
                                 ModelType.RL.to_str(),
                             ],
                             interactive=True,
                             elem_id="filter-columns-type",
@@ -350,12 +406,13 @@ with demo:
                             elem_id="filter-columns-precision",
                         )
                         filter_columns_size = gr.CheckboxGroup(
-                            label="Model sizes",
                             choices=list(NUMERIC_INTERVALS.keys()),
                             value=list(NUMERIC_INTERVALS.keys()),
                             interactive=True,
                             elem_id="filter-columns-size",
                         )
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
@@ -387,7 +444,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -401,7 +457,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -416,7 +471,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -431,7 +485,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -446,7 +499,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -461,7 +513,6 @@ with demo:
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    leaderboard_table,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
@@ -472,7 +523,8 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
-        with gr.TabItem("📈 Benchmark Graphs", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
                     chart = create_metric_plot_obj(
@@ -556,13 +608,7 @@ with demo:
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[
-                            "float16",
-                            "bfloat16",
-                            "8bit (LLM.int8)",
-                            "4bit (QLoRA / FP4)",
-                            "GPTQ"
-                        ],
                         label="Precision",
                         multiselect=False,
                         value="float16",
@@ -598,8 +644,10 @@ with demo:
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
-            ).style(show_copy_button=True)
     dummy = gr.Textbox(visible=False)
     demo.load(

 import json
 import os
+import re
 from datetime import datetime, timezone
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi, snapshot_download
 from src.assets.css_html_js import custom_css, get_window_url_params
 from src.assets.text_content import (
     HUMAN_BASELINES,
 )
 from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
+from src.display_models.modelcard_filter import check_model_card
 from src.display_models.utils import (
     AutoEvalColumn,
     EvalQueueColumn,
     styled_message,
     styled_warning,
 )
+from src.manage_collections import update_collections
+from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub
 from src.rate_limiting import user_submission_permission
 pd.set_option("display.precision", 1)
 def restart_space():
     """Ask the Hub API client to restart the `HuggingFaceH4/open_llm_leaderboard` Space, authenticating with `H4_TOKEN`."""
     api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
     ]
 ]
+snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None)
+snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None)
+requested_models, users_to_submission_dates = get_all_requested_models(EVAL_REQUESTS_PATH)
+original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+update_collections(original_df.copy())
+leaderboard_df = original_df.copy()
+models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
 plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 ## INTERACTION FUNCTIONS
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    if model_type is None or model_type == "":
+        return styled_error("Please select a model type.")
+    # Is the user rate limited?
     num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
     if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
         error_msg = f"Organisation or user `{model.split('/')[0]}`"
         error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
         error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
+        error_msg += (
+            "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
+        )
         return styled_error(error_msg)
+    # Did the model authors forbid its submission to the leaderboard?
+    if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
+        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
+    # Does the model actually exist?
     if revision == "":
         revision = "main"
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
+    model_info = api.model_info(repo_id=model, revision=revision)
+    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except AttributeError:
+        try:
+            size_match = re.search(size_pattern, model.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 65
+    size_factor = 8 if (precision == "GPTQ" or "GPTQ" in model) else 1
+    model_size = size_factor * model_size
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        license = "?"
+    # Were the model card and license filled?
+    modelcard_OK, error_msg = check_model_card(model)
+    if not modelcard_OK:
+        return styled_error(error_msg)
+    # Seems good, creating the eval
+    print("Adding new eval")
     eval_entry = {
         "model": model,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
     }
     user_name = ""
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
+    print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in requested_models:
         return styled_warning("This model has been already submitted.")
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
+    print("Uploading eval file")
     api.upload_file(
         path_or_fileobj=out_path,
         path_in_repo=out_path.split("eval-queue/")[1],
         commit_message=f"Add {model} to eval queue",
     )
+    # Remove the local file
     os.remove(out_path)
     return styled_message(
 # Searching and filtering
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: str,
+    size_query: list,
+    show_deleted: bool,
+    query: str,
+):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
+    filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     """Return the rows of `df` whose `AutoEvalColumn.dummy` column contains `query`, ignoring case.

     NOTE(review): `str.contains` is called without `regex=False`, so pandas' default
     applies and `query` is interpreted as a regular expression. Confirm this is
     intended, since `query` appears to come straight from the search box.
     """
     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
     ]
     return filtered_df
 # Size-filter buckets: checkbox label -> right-closed interval `(low, high]`.
 # The keys are offered as the choices of the "Model sizes (in billions of parameters)"
 # checkbox group, and the selected keys are looked up here by `filter_models`.
 NUMERIC_INTERVALS = {
+    "?": pd.Interval(-1, 0, closed="right"),  # (-1, 0]: matches a params value of 0; presumably "size unknown" - confirm
+    "0~1.5": pd.Interval(0, 1.5, closed="right"),
+    "1.5~3": pd.Interval(1.5, 3, closed="right"),
+    "3~7": pd.Interval(3, 7, closed="right"),
+    "7~13": pd.Interval(7, 13, closed="right"),
+    "13~35": pd.Interval(13, 35, closed="right"),
+    "35~60": pd.Interval(35, 60, closed="right"),
+    "60+": pd.Interval(60, 10000, closed="right"),  # 10000 acts as the upper bound of the open-ended bucket
 }
+def filter_queries(query: str, filtered_df: pd.DataFrame):
+    """Added by Abishek"""
+    final_df = []
+    if query != "":
+        queries = [q.strip() for q in query.split(";")]
+        for _q in queries:
+            _q = _q.strip()
+            if _q != "":
+                temp_filtered_df = search_table(filtered_df, _q)
+                if len(temp_filtered_df) > 0:
+                    final_df.append(temp_filtered_df)
+        if len(final_df) > 0:
+            filtered_df = pd.concat(final_df)
+            filtered_df = filtered_df.drop_duplicates(
+                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            )
+    return filtered_df
 def filter_models(
     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
                 with gr.Column():
                     with gr.Row():
                         search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                             show_label=False,
                             elem_id="search-bar",
                         )
                                 ModelType.FT.to_str(),
                                 ModelType.IFT.to_str(),
                                 ModelType.RL.to_str(),
+                                ModelType.Unknown.to_str(),
                             ],
                             value=[
                                 ModelType.PT.to_str(),
                                 ModelType.FT.to_str(),
                                 ModelType.IFT.to_str(),
                                 ModelType.RL.to_str(),
+                                ModelType.Unknown.to_str(),
                             ],
                             interactive=True,
                             elem_id="filter-columns-type",
                             elem_id="filter-columns-precision",
                         )
                         filter_columns_size = gr.CheckboxGroup(
+                            label="Model sizes (in billions of parameters)",
                             choices=list(NUMERIC_INTERVALS.keys()),
                             value=list(NUMERIC_INTERVALS.keys()),
                             interactive=True,
                             elem_id="filter-columns-size",
                         )
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                 leaderboard_table,
                 queue=True,
             )
+        with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
                     chart = create_metric_plot_obj(
                 with gr.Column():
                     precision = gr.Dropdown(
+                        choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ"],
                         label="Precision",
                         multiselect=False,
                         value="float16",
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
+                lines=20,
                 elem_id="citation-button",
+                show_copy_button=True,
+            )
     dummy = gr.Textbox(visible=False)
     demo.load(