Init

- app.py             +157 -0
- requirements.txt     +6 -0
- src/.gitignore       +1 -0
- src/__init__.py      +0 -0
- src/assets.py       +61 -0
- src/content.py      +31 -0
- src/leaderboard.py +218 -0
- src/llm_perf.py    +220 -0
- src/model_card.py  +160 -0
- src/model_list.py  +529 -0
- src/panel.py        +60 -0
- src/utils.py        +99 -0
app.py
ADDED
@@ -0,0 +1,157 @@
+import gradio as gr
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
+
+from src.assets import custom_css
+from src.content import ABOUT, BGB_LOGO, BGB_TITLE, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
+from src.leaderboard import (
+    BGB_COLUMN_MAPPING,
+    BGB_COLUMN_TO_DATATYPE,
+    CAPABILITY_COLUMNS,
+    create_bgb_leaderboard_table,
+    create_leaderboard_table,
+    get_bgb_leaderboard_df,
+)
+from src.llm_perf import get_eval_df, get_llm_perf_df
+from src.panel import create_select_callback
+
+BGB = True
+
+# prometheus-eval/prometheus-bgb-8x7b-v2.0
+
+# def init_leaderboard():
+#     machine = "1xA10"
+#     open_llm_perf_df = get_llm_perf_df(machine=machine)
+#     search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(open_llm_perf_df)
+#     return machine, search_bar, columns_checkboxes, leaderboard_table
+
+
+EVAL_MODELS = [
+    "gpt-4-turbo-2024-04-09",
+    "prometheus-bgb-8x7b-v2.0",
+]
+
+EVAL_MODEL_TABS = {
+    "gpt-4-turbo-2024-04-09": "GPT-4 as a Judge 🏅",
+    "prometheus-bgb-8x7b-v2.0": "Prometheus as a Judge 🏅",
+}
+
+
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(BGB_LOGO, elem_classes="logo")
+    gr.HTML(BGB_TITLE, elem_classes="title")
+    # gr.HTML(BGB_LOGO_AND_TITLE, elem_classes="title")
+
+    with gr.Tabs(elem_classes="tabs"):
+
+        for idx, eval_model in enumerate(EVAL_MODELS):
+            tab_name = EVAL_MODEL_TABS[eval_model]
+
+            # Previous code without gradio_leaderboard
+
+            # machine = eval_model
+            # machine_textbox = gr.Textbox(value=eval_model, visible=False)
+
+            # if BGB:
+            #     eval_df = get_eval_df(eval_model_name=eval_model)
+            # else:
+            #     eval_df = get_llm_perf_df(machine=machine)
+            # # Leaderboard
+            # with gr.TabItem(tab_name, id=idx):
+            #     if BGB:
+            #         search_bar, columns_checkboxes, type_checkboxes, param_slider, leaderboard_table = create_bgb_leaderboard_table(eval_df)
+            #     else:
+            #         search_bar, columns_checkboxes, type_checkboxes, param_slider, leaderboard_table = (
+            #             create_leaderboard_table(eval_df)
+            #         )
+
+            #     create_select_callback(
+            #         # inputs
+            #         machine_textbox,
+            #         # interactive
+            #         columns_checkboxes,
+            #         search_bar,
+            #         type_checkboxes,
+            #         param_slider,
+            #         # outputs
+            #         leaderboard_table,
+            #     )
+            with gr.TabItem(tab_name, id=idx):
+
+                eval_df = get_eval_df(eval_model_name=eval_model)
+                eval_df = get_bgb_leaderboard_df(eval_df)
+
+                ordered_columns = [
+                    "Model 🤗",
+                    "Average",
+                    "Grounding ⚡️",
+                    "Instruction Following 📝",
+                    "Planning 📅",
+                    "Reasoning 💡",
+                    "Refinement 🔩",
+                    "Safety ⚠️",
+                    "Theory of Mind 🤔",
+                    "Tool Usage 🛠️",
+                    "Multilingual 🇬🇫",
+                    "Model Type",
+                    "Model Params (B)",
+                ]
+
+                ordered_columns_types = [
+                    "markdown",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "number",
+                    "text",
+                    "number",
+                ]
+
+                eval_df = eval_df[ordered_columns]
+
+                Leaderboard(
+                    value=eval_df,
+                    datatype=ordered_columns_types,
+                    select_columns=SelectColumns(
+                        default_selection=ordered_columns,
+                        cant_deselect=["Model 🤗", "Model Type", "Model Params (B)"],
+                        label="Select Columns to Display:",
+                    ),
+                    search_columns=["Model 🤗"],
+                    # hide_columns=["model_name_for_query", "Model Size"],
+                    filter_columns=[
+                        ColumnFilter("Model Type", type="checkboxgroup", label="Model types"),
+                        ColumnFilter(
+                            "Model Params (B)",
+                            min=0,
+                            max=150,
+                            default=[0, 150],
+                            type="slider",
+                            label="Model Params (B)",
+                        ),
+                    ],
+                )
+
+        ####################### ABOUT TAB #######################
+        with gr.TabItem("About 📖", id=3):
+            gr.Markdown(ABOUT, elem_classes="descriptive-text")
+
+    ####################### CITATION
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+if __name__ == "__main__":
+    # Launch demo
+    demo.queue().launch()
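The Leaderboard component above replaces the hand-wired search bar, checkboxes, and select callback left commented out in the loop: searching, column selection, and filtering are declared once and handled by the component. A minimal self-contained sketch of the same pattern, using a toy DataFrame instead of the BiGGen-Bench results (the toy data and column values are illustrative, not the leaderboard's):

import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

# toy stand-in for eval_df; "Model 🤗" is markdown so links render
toy_df = pd.DataFrame(
    {
        "Model 🤗": ["[model-a](https://huggingface.co/model-a)", "[model-b](https://huggingface.co/model-b)"],
        "Average": [3.9, 3.1],
        "Model Type": ["Chat", "Base"],
        "Model Params (B)": [8.0, 7.0],
    }
)

with gr.Blocks() as toy_demo:
    Leaderboard(
        value=toy_df,
        datatype=["markdown", "number", "text", "number"],
        select_columns=SelectColumns(default_selection=list(toy_df.columns), cant_deselect=["Model 🤗"]),
        search_columns=["Model 🤗"],
        filter_columns=[
            ColumnFilter("Model Type", type="checkboxgroup", label="Model types"),
            ColumnFilter("Model Params (B)", min=0, max=10, default=[0, 10], type="slider"),
        ],
    )

if __name__ == "__main__":
    toy_demo.queue().launch()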
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+huggingface_hub
+transformers
+gradio
+gradio_leaderboard
+plotly
+pandas
src/.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
src/__init__.py
ADDED
(empty file)
src/assets.py
ADDED
@@ -0,0 +1,61 @@
+custom_css = """
+.logo {
+    width: 300px;
+    height: auto;
+    margin: 0 auto;
+    max-width: 100%;
+    object-fit: contain;
+}
+.text {
+    font-size: 16px !important;
+}
+
+.tabs button {
+    font-size: 20px;
+}
+.subtabs button {
+    font-size: 20px;
+}
+
+.descriptive-text span {
+    font-size: 16px !important;
+}
+
+#control-panel span {
+    font-size: 20px !important;
+}
+#search-bar span {
+    font-size: 16px !important;
+}
+#threshold-slider span {
+    font-size: 16px !important;
+}
+#memory-slider span {
+    font-size: 16px !important;
+}
+#columns-checkboxes span {
+    font-size: 16px !important;
+}
+#backend-checkboxes span {
+    font-size: 16px !important;
+}
+#dtype-checkboxes span {
+    font-size: 16px !important;
+}
+#optimization-checkboxes span {
+    font-size: 16px !important;
+}
+#quantization-checkboxes span {
+    font-size: 16px !important;
+}
+#kernel-checkboxes span {
+    font-size: 16px !important;
+}
+
+#leaderboard-table td:first-child,
+#leaderboard-table th:first-child {
+    max-width: 300px;
+    overflow: auto;
+    white-space: nowrap;
+}
+"""
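These id and class selectors only fire when a Gradio component declares the matching elem_id or elem_classes; the CSS itself is injected via gr.Blocks(css=custom_css) in app.py. A minimal sketch of the pairing (the component choices here are illustrative):

import gradio as gr
from src.assets import custom_css

with gr.Blocks(css=custom_css) as sketch:
    # elem_id must match the id selector (#search-bar span) for the rule to apply
    gr.Textbox(label="Model 🤗", elem_id="search-bar")
    # elem_classes maps to class selectors such as .descriptive-text
    gr.Markdown("Some description", elem_classes="descriptive-text")

# sketch.launch()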
src/content.py
ADDED
@@ -0,0 +1,31 @@
+LOGO = '<img src="https://raw.githubusercontent.com/prometheus-eval/leaderboard/main/logo.png">'
+
+TITLE = """<h1 align="center" id="space-title">🤗 BiGGen-Bench Leaderboard 🏋️</h1>"""
+
+BGB_LOGO = '<img src="https://raw.githubusercontent.com/prometheus-eval/leaderboard/main/logo.png" alt="Logo" style="width: 30%; display: block; margin: auto;">'
+BGB_TITLE = """<h1 align="center">BiGGen-Bench Leaderboard</h1>"""
+
+
+ABOUT = """
+## 📖 About
+### BiGGen-Bench Leaderboard
+
+Welcome to the 🏆 BiGGen-Bench Leaderboard 🏆, a dedicated benchmarking platform designed to evaluate the nuanced capabilities of Generative Language Models (GLMs) across a variety of complex and diverse tasks. Leveraging the refined methodologies of [BiGGen-Bench](https://github.com/prometheus-eval/prometheus-eval), our leaderboard offers a comprehensive assessment framework that mirrors human-like discernment and precision in evaluating language models.
+
+#### Evaluation Details
+
+- **Evaluation Scope**: Covers nine key capabilities of GLMs across 77 tasks, with 765 unique instances tailored to test specific aspects of model performance.
+- **Scoring System**: Utilizes a detailed scoring rubric from 1 to 5, reflecting a range of outcomes based on instance-specific criteria closely aligned with the nuanced requirements of each task.
+- **Hardware and Setup**: Benchmarks are conducted using a controlled setup to ensure consistent and fair comparison across different models.
+- **Transparency and Openness**: All code, data, and detailed evaluation results are publicly available to foster transparency and enable community-driven enhancements and verifications.
+
+#### Benchmarking Script
+
+All benchmarks are executed using the provided [code](https://github.com/prometheus-eval/prometheus-eval/blob/main/BiGGen-Bench) within the BiGGen-Bench repository. This script ensures that all models are evaluated under identical conditions, guaranteeing reliability and reproducibility of results.
+
+"""
+
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
+CITATION_BUTTON = r"""TBA
+"""
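The ABOUT text pins down the arithmetic behind the table: each of the 765 instances receives a 1-5 rubric score, those presumably roll up into the nine per-capability scores, and the leaderboard's "Average" is the mean over the nine capabilities (this last step is exactly what get_eval_df in src/llm_perf.py computes). A worked sketch with made-up scores:

import pandas as pd

# hypothetical per-capability scores for one model (each on the 1-5 rubric scale)
scores = pd.DataFrame([{
    "grounding": 4.2, "instruction_following": 4.0, "planning": 3.5,
    "reasoning": 3.8, "refinement": 3.1, "safety": 4.5,
    "theory_of_mind": 3.9, "tool_usage": 2.8, "multilingual": 3.3,
}])
# mean over the nine capability columns, rounded to 3 decimals as in get_eval_df
scores["average"] = scores.mean(axis=1).round(3)
print(scores["average"].iloc[0])  # 3.678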
src/leaderboard.py
ADDED
@@ -0,0 +1,218 @@
+import gradio as gr
+
+from src.utils import model_hyperlink, process_score
+
+LEADERBOARD_COLUMN_TO_DATATYPE = {
+    # open llm
+    "Model 🤗": "markdown",
+    "Experiment 🧪": "str",
+    # primary measurements
+    "Prefill (s)": "number",
+    "Decode (tokens/s)": "number",
+    "Memory (MB)": "number",
+    "Energy (tokens/kWh)": "number",
+    # deployment settings
+    "Backend 🏭": "str",
+    "Precision 📥": "str",
+    "Quantization 🗜️": "str",
+    "Attention 👁️": "str",
+    "Kernel ⚛️": "str",
+    # additional measurements
+    # "Reserved Memory (MB)": "number",
+    # "Used Memory (MB)": "number",
+    "Open LLM Score (%)": "number",
+    "End-to-End (s)": "number",
+    "Architecture 🏛️": "str",
+    "Params (B)": "number",
+}
+
+
+PRIMARY_COLUMNS = [
+    "Model 🤗",
+    "Experiment 🧪",
+    "Prefill (s)",
+    "Decode (tokens/s)",
+    "Memory (MB)",
+    "Energy (tokens/kWh)",
+    "Open LLM Score (%)",
+]
+
+
+CAPABILITY_COLUMNS = [
+    "Grounding ⚡️",
+    "Instruction Following 📝",
+    "Planning 📅",
+    "Reasoning 💡",
+    "Refinement 🔩",
+    "Safety ⚠️",
+    "Theory of Mind 🤔",
+    "Tool Usage 🛠️",
+    "Multilingual 🇬🇫",
+]
+
+
+BGB_COLUMN_MAPPING = {
+    "model_name_or_path": "Model 🤗",
+    "average": "Average",
+    "grounding": "Grounding ⚡️",
+    "instruction_following": "Instruction Following 📝",
+    "planning": "Planning 📅",
+    "reasoning": "Reasoning 💡",
+    "refinement": "Refinement 🔩",
+    "safety": "Safety ⚠️",
+    "theory_of_mind": "Theory of Mind 🤔",
+    "tool_usage": "Tool Usage 🛠️",
+    "multilingual": "Multilingual 🇬🇫",
+    "model_params": "Model Params (B)",
+    "model_type": "Model Type",
+}
+
+
+BGB_COLUMN_TO_DATATYPE = {
+    "Model 🤗": "markdown",
+    "Average": "number",
+    "Grounding ⚡️": "number",
+    "Instruction Following 📝": "number",
+    "Planning 📅": "number",
+    "Reasoning 💡": "number",
+    "Refinement 🔩": "number",
+    "Safety ⚠️": "number",
+    "Theory of Mind 🤔": "number",
+    "Tool Usage 🛠️": "number",
+    "Multilingual 🇬🇫": "number",
+    "Model Params (B)": "number",
+    "Model Type": "str",
+}
+
+
+def process_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+
+
+# TODO: Process base, chat, proprietary models differently
+def process_bgb_model(row):
+    model_name = row.iloc[0]
+    model_type = row.iloc[1]
+
+    if model_type == "Base" or model_type == "Chat":
+        link = f"https://huggingface.co/{model_name}"
+        return model_hyperlink(link, model_name)
+    elif model_type == "Proprietary":
+        api_model_2_link = {
+            "gpt-3.5-turbo-1106": "https://platform.openai.com/docs/models/gpt-3-5",
+            "gpt-3.5-turbo-0125": "https://platform.openai.com/docs/models/gpt-3-5",
+            "gpt-4-0125-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
+            "gpt-4-1106-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
+            "gpt-4-turbo-2024-04-09": "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
+            "gpt-4o-2024-05-13": "https://openai.com/index/hello-gpt-4o/",
+            "claude-3-haiku-20240307": "https://www.anthropic.com/news/claude-3-family",
+            "claude-3-opus-20240229": "https://www.anthropic.com/news/claude-3-family",
+            "claude-3-sonnet-20240229": "https://www.anthropic.com/news/claude-3-family",
+            "mistral-large": "https://mistral.ai/news/mistral-large/",
+            "mistral-medium": "https://mistral.ai/news/la-plateforme/",
+            "gemini-1.0-pro": "https://deepmind.google/technologies/gemini/pro/",
+            "gemini-pro-1.5": "https://deepmind.google/technologies/gemini/pro/",
+            "google/gemini-flash-1.5": "https://deepmind.google/technologies/gemini/flash/",
+        }
+
+        link = api_model_2_link[model_name]
+        return model_hyperlink(link, model_name)
+
+    else:
+        raise NotImplementedError(f"Model type {model_type} not implemented")
+
+
+def get_leaderboard_df(llm_perf_df):
+    df = llm_perf_df.copy()
+    # transform for leaderboard; process_model takes a single model name
+    df["Model 🤗"] = df["Model 🤗"].apply(process_model)
+    # process quantization for leaderboard
+    df["Open LLM Score (%)"] = df.apply(lambda x: process_score(x["Open LLM Score (%)"], x["Quantization 🗜️"]), axis=1)
+    return df
+
+
+def get_bgb_leaderboard_df(eval_df):
+    df = eval_df.copy()
+    # transform for leaderboard; process_bgb_model needs the (name, type) pair
+    df["Model 🤗"] = df[["Model 🤗", "Model Type"]].apply(process_bgb_model, axis=1)
+    return df
+
+
+def create_leaderboard_table(llm_perf_df):
+    # get dataframe
+    leaderboard_df = get_leaderboard_df(llm_perf_df)
+
+    # create search bar
+    with gr.Row():
+        search_bar = gr.Textbox(
+            label="Model 🤗",
+            info="🔍 Search for a model name",
+            elem_id="search-bar",
+        )
+    # create checkboxes
+    with gr.Row():
+        columns_checkboxes = gr.CheckboxGroup(
+            label="Columns 📊",
+            value=PRIMARY_COLUMNS,
+            choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+            info="✔️ Select the columns to display",
+            elem_id="columns-checkboxes",
+        )
+    # create table
+    leaderboard_table = gr.components.Dataframe(
+        value=leaderboard_df[PRIMARY_COLUMNS],
+        datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
+        headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+        elem_id="leaderboard-table",
+    )
+
+    return search_bar, columns_checkboxes, leaderboard_table
+
+
+def create_bgb_leaderboard_table(eval_df):
+    # get dataframe
+    bgb_leaderboard_df = get_bgb_leaderboard_df(eval_df)
+
+    # create search bar
+    with gr.Row():
+        search_bar = gr.Textbox(
+            label="Model 🤗",
+            info="🔍 Search for a model name",
+            elem_id="search-bar",
+        )
+
+    with gr.Row():
+        type_checkboxes = gr.CheckboxGroup(
+            label="Model Type",
+            value=["Base", "Chat", "Proprietary"],
+            choices=["Base", "Chat", "Proprietary"],
+            info="✔️ Select the model types to display",
+            elem_id="type-checkboxes",
+        )
+
+    with gr.Row():
+        param_slider = gr.Slider(
+            minimum=0, maximum=150, value=7, step=1, interactive=True, label="Model Params (B)", elem_id="param-slider"
+        )
+
+    # create checkboxes
+    with gr.Row():
+        columns_checkboxes = gr.CheckboxGroup(
+            label="Capabilities 📊",
+            value=CAPABILITY_COLUMNS,
+            choices=CAPABILITY_COLUMNS,
+            info="✔️ Select the capabilities to display",
+            elem_id="columns-checkboxes",
+        )
+
+    # create table; headers use the display names so they match the values shown
+    bgb_leaderboard_table = gr.components.Dataframe(
+        value=bgb_leaderboard_df[list(BGB_COLUMN_MAPPING.values())],
+        datatype=list(BGB_COLUMN_TO_DATATYPE.values()),
+        headers=list(BGB_COLUMN_MAPPING.values()),
+        elem_id="leaderboard-table",
+    )
+
+    return search_bar, columns_checkboxes, type_checkboxes, param_slider, bgb_leaderboard_table
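process_bgb_model expects a two-element row of (model name, model type), which is why get_bgb_leaderboard_df applies it over the ["Model 🤗", "Model Type"] pair with axis=1. A quick sketch of the two branches; the exact output depends on model_hyperlink from src.utils, which is not in this diff but presumably renders a markdown link:

import pandas as pd
from src.leaderboard import process_bgb_model

hub_row = pd.Series(["meta-llama/Meta-Llama-3-8B-Instruct", "Chat"])
api_row = pd.Series(["gpt-4o-2024-05-13", "Proprietary"])

print(process_bgb_model(hub_row))  # link to the model's Hugging Face page
print(process_bgb_model(api_row))  # link looked up in api_model_2_link (the GPT-4o announcement)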
src/llm_perf.py
ADDED
@@ -0,0 +1,220 @@
+import os
+from pathlib import Path
+
+import pandas as pd
+
+from src.model_list import MODEL_MAPPING, MODEL_SHORT_TO_LONG, get_all_model_list
+from src.utils import process_kernels, process_quantizations
+
+COLUMNS_MAPPING = {
+    "config.name": "Experiment 🧪",
+    "config.backend.model": "Model 🤗",
+    # primary measurements
+    "report.prefill.latency.p50": "Prefill (s)",
+    "report.per_token.latency.p50": "Per Token (s)",
+    "report.decode.throughput.value": "Decode (tokens/s)",
+    "report.decode.efficiency.value": "Energy (tokens/kWh)",
+    "report.decode.memory.max_allocated": "Memory (MB)",
+    # deployment settings
+    "config.backend.name": "Backend 🏭",
+    "config.backend.torch_dtype": "Precision 📥",
+    "quantization": "Quantization 🗜️",
+    "attention": "Attention 👁️",
+    "kernel": "Kernel ⚛️",
+    # additional information
+    "architecture": "Architecture 🏛️",
+    "prefill+decode": "End-to-End (s)",
+    "Average ⬆️": "Open LLM Score (%)",
+    "#Params (B)": "Params (B)",
+}
+SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
+SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
+SORTING_ASCENDING = [False, True, False]
+
+BGB_SORTING_COLUMNS = ["Average"]
+
+# Use the above capabilities to create the columns
+BGB_COLUMNS_MAPPING = {
+    "model_name_or_path": "Model 🤗",
+    "model_params": "Model Params (B)",
+    "model_type": "Model Type",
+    "average": "Average",
+    "grounding": "Grounding ⚡️",
+    "instruction_following": "Instruction Following 📝",
+    "planning": "Planning 📅",
+    "reasoning": "Reasoning 💡",
+    "refinement": "Refinement 🔩",
+    "safety": "Safety ⚠️",
+    "theory_of_mind": "Theory of Mind 🤔",
+    "tool_usage": "Tool Usage 🛠️",
+    "multilingual": "Multilingual 🇬🇫",
+}
+
+
+def get_raw_llm_perf_df(machine: str = "1xA10"):
+    dfs = []
+    for subset in SUBSETS:
+        try:
+            dfs.append(
+                pd.read_csv(f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{subset}-{machine}.csv")
+            )
+        except Exception:
+            print(f"Subset {subset} for machine {machine} not found")
+
+    perf_df = pd.concat(dfs)
+    llm_df = pd.read_csv("hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-df.csv")
+
+    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="config.backend.model")
+
+    return llm_perf_df
+
+
+def processed_llm_perf_df(llm_perf_df):
+    # some assertions
+    assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
+    assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
+    assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
+    assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
+    # fix a couple of things
+    llm_perf_df.dropna(subset=["report.decode.latency.p50"], inplace=True)
+    llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace("flash_attention_2", "fa2")
+    llm_perf_df["prefill+decode"] = llm_perf_df["report.prefill.latency.p50"] + (
+        llm_perf_df["report.decode.latency.p50"]
+    )
+    # llm_perf_df["architecture"] = llm_perf_df["config.backend.model"].apply(
+    #     process_architectures
+    # )
+    llm_perf_df["architecture"] = llm_perf_df["Architecture"]
+    llm_perf_df["attention"] = (
+        llm_perf_df["config.backend.attn_implementation"]
+        .str.replace("flash_attention_2", "FAv2")
+        .str.replace("eager", "Eager")
+        .str.replace("sdpa", "SDPA")
+    )
+    llm_perf_df["quantization"] = llm_perf_df.apply(process_quantizations, axis=1)
+    llm_perf_df["kernel"] = llm_perf_df.apply(process_kernels, axis=1)
+    # round numerical columns
+    llm_perf_df = llm_perf_df.round(
+        {
+            "report.prefill.latency.p50": 3,
+            "report.decode.latency.p50": 3,
+            "report.decode.throughput.value": 3,
+            "report.decode.efficiency.value": 3,
+            "report.decode.memory.max_allocated": 3,
+            "Average ⬆️": 3,
+            "prefill+decode": 3,
+            "#Params (B)": 3,
+        }
+    )
+    # filter columns
+    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
+    # rename columns
+    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
+    # sort by metric
+    llm_perf_df.sort_values(
+        by=SORTING_COLUMNS,
+        ascending=SORTING_ASCENDING,
+        inplace=True,
+    )
+
+    return llm_perf_df
+
+
+def get_llm_perf_df(machine: str = "1xA10"):
+    if os.path.exists(f"llm-perf-leaderboard-{machine}.csv"):
+        llm_perf_df = pd.read_csv(f"llm-perf-leaderboard-{machine}.csv")
+    else:
+        llm_perf_df = get_raw_llm_perf_df(machine)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df.to_csv(f"llm-perf-leaderboard-{machine}.csv", index=False)
+
+    return llm_perf_df
+
+
+def get_eval_df(eval_model_name: str):
+    assert eval_model_name in ["gpt-4-turbo-2024-04-09", "prometheus-bgb-8x7b-v2.0"]
+
+    base_dir = Path(__file__).parent.parent / "data"
+    filepath = base_dir / f"bgb-leaderboard-{eval_model_name}.pkl"
+    # For debugging
+    csv_filepath = base_dir / f"bgb-leaderboard-{eval_model_name}.csv"
+
+    def change_model_name(model_name: str):
+        # TODO: Hard-code models with different names
+        model_name_or_path = MODEL_SHORT_TO_LONG.get(model_name, model_name)
+        if model_name == "qwen/qwen-110b-chat":
+            model_name_or_path = "Qwen/Qwen1.5-110B-Chat"
+
+        if model_name_or_path.endswith("-hjpark"):
+            model_name_or_path = model_name_or_path.replace("-hjpark", "")
+
+        return model_name_or_path
+
+    if os.path.exists(filepath) and False:  # pickle cache lookup intentionally disabled while debugging
+        eval_df = pd.read_pickle(filepath)
+    else:
+        # Process the df
+        raw_filepath = base_dir / f"eval_by_{eval_model_name}.csv"
+        eval_df = pd.read_csv(raw_filepath)
+
+        eval_df["model_name_or_path"] = eval_df["model_name"].apply(lambda x: change_model_name(x))
+        eval_df.drop(columns=["model_name"], inplace=True)
+
+        eval_df["model_params"] = eval_df["model_name_or_path"].apply(
+            lambda x: MODEL_MAPPING.get(x, ["Unknown", "Unknown"])[0]
+        )
+        eval_df["model_type"] = eval_df["model_name_or_path"].apply(
+            lambda x: MODEL_MAPPING.get(x, ["Unknown", "Unknown"])[1]
+        )
+
+        capabilities = [
+            "grounding",
+            "instruction_following",
+            "planning",
+            "reasoning",
+            "refinement",
+            "safety",
+            "theory_of_mind",
+            "tool_usage",
+            "multilingual",
+        ]
+
+        # Take the average of the capabilities
+        eval_df["average"] = eval_df[capabilities].mean(axis=1)
+
+        # Round capabilities and average to 3 decimal places
+        eval_df = eval_df.round(
+            {
+                "average": 3,
+                "grounding": 3,
+                "instruction_following": 3,
+                "planning": 3,
+                "reasoning": 3,
+                "refinement": 3,
+                "safety": 3,
+                "theory_of_mind": 3,
+                "tool_usage": 3,
+                "multilingual": 3,
+            }
+        )
+
+        # print(eval_df[eval_df['model_params'] == 'Unknown'])
+        eval_df.rename(columns=BGB_COLUMNS_MAPPING, inplace=True)
+
+        eval_df.sort_values(
+            by=BGB_SORTING_COLUMNS,
+            ascending=False,
+            inplace=True,
+        )
+
+        eval_df.to_pickle(str(filepath))
+        eval_df.to_csv(str(csv_filepath), index=False)
+        # import pdb; pdb.set_trace()
+
+    return eval_df
+
+
+if __name__ == "__main__":
+    get_eval_df("gpt-4-turbo-2024-04-09")
+    get_eval_df("prometheus-bgb-8x7b-v2.0")
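get_eval_df is the app's entry point into this module; a sketch of what to expect when running it standalone, assuming the data/eval_by_*.csv files shipped with the Space are present (they are not part of this diff):

from src.llm_perf import get_eval_df

df = get_eval_df("prometheus-bgb-8x7b-v2.0")
# columns carry the display names from BGB_COLUMNS_MAPPING, sorted by "Average" descending
print(df.columns.tolist())
print(df[["Model 🤗", "Average"]].head())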
src/model_card.py
ADDED
@@ -0,0 +1,160 @@
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
+from huggingface_hub import ModelCard
+from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
+from transformers import AutoConfig, AutoTokenizer
+
+
+# ht to @Wauplin, thank you for the snippet!
+# See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
+def check_model_card(repo_id: str) -> tuple[bool, str, ModelCard | None]:
+    # Returns operation status, an error message, and the card (if any)
+    try:
+        card = ModelCard.load(repo_id)
+    except huggingface_hub.utils.EntryNotFoundError:
+        return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None
+
+    # Enforce license metadata
+    if card.data.license is None:
+        if not ("license_name" in card.data and "license_link" in card.data):
+            return (
+                False,
+                (
+                    "License not found. Please add a license to your model card using the `license` metadata or a"
+                    " `license_name`/`license_link` pair."
+                ),
+                None,
+            )
+
+    # Enforce card content
+    if len(card.text) < 200:
+        return False, "Please add a description to your model card, it is too short.", None
+
+    return True, "", card
+
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str, AutoConfig]:
+    try:
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )  # , force_download=True)
+        if test_tokenizer:
+            try:
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
+            except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
+                return (
+                    False,
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
+                )
+        return True, None, config
+
+    except ValueError:
+        return (
+            False,
+            "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
+            None,
+        )
+
+    except Exception as e:
+        if "You are trying to access a gated repo." in str(e):
+            return True, "uses a gated model.", None
+        return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+    safetensors = None
+    try:
+        safetensors = get_safetensors_metadata(model_info.id)
+    except Exception as e:
+        print(e)
+
+    if safetensors is not None:
+        model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)
+    else:
+        try:
+            size_match = re.search(size_pattern, model_info.id.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+
+def get_model_arch(model_info: ModelInfo):
+    return model_info.config.get("architectures", "Unknown")
+
+
+def get_model_tags(model_card, model: str):
+    is_merge_from_metadata = False
+    is_moe_from_metadata = False
+
+    tags = []
+    if model_card is None:
+        return tags
+    if model_card.data.tags:
+        is_merge_from_metadata = any(
+            [tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]]
+        )
+        is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
+
+    is_merge_from_model_card = any(
+        keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
+    )
+    if is_merge_from_model_card or is_merge_from_metadata:
+        tags.append("merge")
+    is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
+    # Hardcoding because of gating problem
+    if "Qwen/Qwen1.5-32B" in model:
+        is_moe_from_model_card = False
+    is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
+    if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
+        tags.append("moe")
+
+    return tags
+
+
+def test():
+    model = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+    # Test check_model_card
+    status, error, card = check_model_card(model)
+
+    # Test is_model_on_hub
+    status2, error2, config2 = is_model_on_hub(model, "main")
+    assert status == True
+    print(status2, error2, config2)
+
+    # Test get_model_size
+    model_info = ModelInfo(id=model)
+    precision = "GPTQ"
+    model_size = get_model_size(model_info, precision)
+    print(model_size)
+
+    import pdb

+    pdb.set_trace()
+
+    # Test get_model_arch
+    # model_arch = get_model_arch(model_info)
+
+    pass
+
+
+if __name__ == "__main__":
+    test()
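When safetensors metadata is unavailable, get_model_size falls back to parsing the parameter count out of the repo id itself: the regex grabs the first run of digits (optionally with a decimal part) ending in "b" or "m". The real function additionally multiplies by 8 for GPTQ repos and returns 0 when nothing matches. A standalone sketch of just that fallback:

import re

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

for repo_id in ["teknium/OpenHermes-2.5-Mistral-7B", "microsoft/phi-1_5", "Qwen/Qwen1.5-0.5B"]:
    match = re.search(size_pattern, repo_id.lower())
    if match is None:
        print(repo_id, "-> 0 (unknown size, as in get_model_size)")
        continue
    size = match.group(0)
    # "b" means billions already; "m" means millions, so divide by 1e3
    billions = float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3
    print(repo_id, "->", round(billions, 3))  # 7.0 and 0.5 for the two matching ids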
src/model_list.py
ADDED
|
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MODELS = {
|
| 2 |
+
"pretrained": {
|
| 3 |
+
"<=4B": [
|
| 4 |
+
"microsoft/phi-1",
|
| 5 |
+
"microsoft/phi-1_5",
|
| 6 |
+
"microsoft/phi-2",
|
| 7 |
+
"Qwen/Qwen1.5-0.5B",
|
| 8 |
+
"Qwen/Qwen1.5-1.8B",
|
| 9 |
+
"Qwen/Qwen1.5-4B",
|
| 10 |
+
"google/gemma-2b",
|
| 11 |
+
"allenai/OLMo-1B",
|
| 12 |
+
],
|
| 13 |
+
"<=7B": [
|
| 14 |
+
"google/gemma-7b",
|
| 15 |
+
"mistralai/Mistral-7B-v0.1",
|
| 16 |
+
"Qwen/Qwen1.5-7B",
|
| 17 |
+
"01-ai/Yi-6B",
|
| 18 |
+
"meta-llama/Llama-2-7b-hf",
|
| 19 |
+
"codellama/CodeLlama-7b-hf",
|
| 20 |
+
"EleutherAI/llemma_7b",
|
| 21 |
+
"allenai/OLMo-7B",
|
| 22 |
+
"mistral-community/Mistral-7B-v0.2",
|
| 23 |
+
],
|
| 24 |
+
"<=14B": [
|
| 25 |
+
"Qwen/Qwen1.5-14B",
|
| 26 |
+
"meta-llama/Llama-2-13b-hf",
|
| 27 |
+
"codellama/CodeLlama-13b-hf",
|
| 28 |
+
"upstage/SOLAR-10.7B-v1.0",
|
| 29 |
+
"meta-llama/Meta-Llama-3-8B",
|
| 30 |
+
],
|
| 31 |
+
"<=50B": [
|
| 32 |
+
"01-ai/Yi-34B",
|
| 33 |
+
"EleutherAI/llemma_34b",
|
| 34 |
+
"codellama/CodeLlama-34b-hf",
|
| 35 |
+
"mistralai/Mixtral-8x7B-v0.1",
|
| 36 |
+
"Qwen/Qwen1.5-32B",
|
| 37 |
+
],
|
| 38 |
+
"<=75B": [
|
| 39 |
+
"meta-llama/Llama-2-70b-hf",
|
| 40 |
+
"codellama/CodeLlama-70b-hf",
|
| 41 |
+
"meta-llama/Meta-Llama-3-70B",
|
| 42 |
+
"Qwen/Qwen1.5-72B",
|
| 43 |
+
],
|
| 44 |
+
"<=175B": [
|
| 45 |
+
"mistral-community/Mixtral-8x22B-v0.1-AWQ",
|
| 46 |
+
],
|
| 47 |
+
},
|
| 48 |
+
"instruction_tuned": {
|
| 49 |
+
"<=4B": [
|
| 50 |
+
"Qwen/Qwen1.5-0.5B-Chat",
|
| 51 |
+
"Qwen/Qwen1.5-1.8B-Chat",
|
| 52 |
+
"Qwen/Qwen1.5-4B-Chat",
|
| 53 |
+
"google/gemma-2b-it",
|
| 54 |
+
"google/gemma-1.1-2b-it",
|
| 55 |
+
"microsoft/Phi-3-mini-4k-instruct",
|
| 56 |
+
"microsoft/Phi-3-mini-128k-instruct",
|
| 57 |
+
],
|
| 58 |
+
"<=7B": [
|
| 59 |
+
"google/gemma-7b-it",
|
| 60 |
+
"mistralai/Mistral-7B-Instruct-v0.2",
|
| 61 |
+
"Qwen/Qwen1.5-7B-Chat",
|
| 62 |
+
"01-ai/Yi-6B-Chat",
|
| 63 |
+
"meta-llama/Llama-2-7b-chat-hf",
|
| 64 |
+
"codellama/CodeLlama-7b-Instruct-hf",
|
| 65 |
+
"allenai/OLMo-7B-SFT",
|
| 66 |
+
"allenai/OLMo-7B-Instruct",
|
| 67 |
+
"allenai/tulu-2-7b",
|
| 68 |
+
"allenai/tulu-2-dpo-7b",
|
| 69 |
+
"allenai/codetulu-2-7b",
|
| 70 |
+
"microsoft/Orca-2-7b",
|
| 71 |
+
"openchat/openchat-3.5-0106",
|
| 72 |
+
"teknium/OpenHermes-2-Mistral-7B",
|
| 73 |
+
"teknium/OpenHermes-2.5-Mistral-7B",
|
| 74 |
+
"NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
|
| 75 |
+
"HuggingFaceH4/zephyr-7b-beta",
|
| 76 |
+
"berkeley-nest/Starling-LM-7B-alpha",
|
| 77 |
+
"Nexusflow/Starling-LM-7B-beta",
|
| 78 |
+
"kaist-ai/mistral-orpo-alpha",
|
| 79 |
+
"kaist-ai/mistral-orpo-beta",
|
| 80 |
+
"google/gemma-1.1-7b-it",
|
| 81 |
+
],
|
| 82 |
+
"<=14B": [
|
| 83 |
+
"Qwen/Qwen1.5-14B-Chat",
|
| 84 |
+
"meta-llama/Llama-2-13b-chat-hf",
|
| 85 |
+
"codellama/CodeLlama-13b-Instruct-hf",
|
| 86 |
+
"allenai/tulu-2-13b",
|
| 87 |
+
"allenai/tulu-2-dpo-13b",
|
| 88 |
+
"allenai/codetulu-2-13b",
|
| 89 |
+
"microsoft/Orca-2-13b",
|
| 90 |
+
"upstage/SOLAR-10.7B-Instruct-v1.0",
|
| 91 |
+
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 92 |
+
"CohereForAI/aya-101",
|
| 93 |
+
],
|
| 94 |
+
"<=50B": [
|
| 95 |
+
"01-ai/Yi-34B-Chat",
|
| 96 |
+
"codellama/CodeLlama-34b-Instruct-hf",
|
| 97 |
+
"allenai/codetulu-2-34b",
|
| 98 |
+
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 99 |
+
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
|
| 100 |
+
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
|
| 101 |
+
"NousResearch/Nous-Hermes-2-Yi-34B",
|
| 102 |
+
"CohereForAI/c4ai-command-r-v01",
|
| 103 |
+
"Qwen/Qwen1.5-32B-Chat",
|
| 104 |
+
],
|
| 105 |
+
"<=75B": [
|
| 106 |
+
"meta-llama/Llama-2-70b-chat-hf",
|
| 107 |
+
"codellama/CodeLlama-70b-Instruct-hf",
|
| 108 |
+
"Qwen/Qwen1.5-72B-Chat",
|
| 109 |
+
"allenai/tulu-2-dpo-70b",
|
| 110 |
+
"meta-llama/Meta-Llama-3-70B-Instruct",
|
| 111 |
+
],
|
| 112 |
+
"<=175B": [
|
| 113 |
+
"alpindale/c4ai-command-r-plus-GPTQ",
|
| 114 |
+
"MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ",
|
| 115 |
+
"MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ",
|
| 116 |
+
],
|
| 117 |
+
},
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
API_MODELS = [
|
| 121 |
+
"gpt-3.5-turbo-0125",
|
| 122 |
+
"gpt-3.5-turbo-1106",
|
| 123 |
+
"gpt-4-0125-preview",
|
| 124 |
+
"gpt-4-1106-preview",
|
| 125 |
+
"gpt-4-turbo-2024-04-09",
|
| 126 |
+
"gpt-4o-2024-05-13",
|
| 127 |
+
"claude-3-haiku-20240307",
|
| 128 |
+
"claude-3-opus-20240229",
|
| 129 |
+
"claude-3-sonnet-20240229",
|
| 130 |
+
"mistral-large",
|
| 131 |
+
"mistral-medium",
|
| 132 |
+
"gemini-1.0-pro",
|
| 133 |
+
"gemini-pro-1.5",
|
| 134 |
+
"google/gemini-flash-1.5",
|
| 135 |
+
"qwen/qwen-110b-chat",
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
ORDERED_MODELS = [
|
| 140 |
+
"microsoft/phi-1",
|
| 141 |
+
"microsoft/phi-1_5",
|
| 142 |
+
"microsoft/phi-2",
|
| 143 |
+
"Qwen/Qwen1.5-0.5B",
|
| 144 |
+
"Qwen/Qwen1.5-1.8B",
|
| 145 |
+
"Qwen/Qwen1.5-4B",
|
| 146 |
+
"google/gemma-2b",
|
| 147 |
+
"allenai/OLMo-1B",
|
| 148 |
+
"Qwen/Qwen1.5-0.5B-Chat",
|
| 149 |
+
"Qwen/Qwen1.5-1.8B-Chat",
|
| 150 |
+
"Qwen/Qwen1.5-4B-Chat",
|
| 151 |
+
"microsoft/Phi-3-mini-4k-instruct",
|
| 152 |
+
"microsoft/Phi-3-mini-128k-instruct",
|
| 153 |
+
"google/gemma-2b-it",
|
| 154 |
+
"google/gemma-1.1-2b-it",
|
| 155 |
+
"google/gemma-7b",
|
| 156 |
+
"mistralai/Mistral-7B-v0.1",
|
| 157 |
+
"mistral-community/Mistral-7B-v0.2",
|
| 158 |
+
"Qwen/Qwen1.5-7B",
|
| 159 |
+
"01-ai/Yi-6B",
|
| 160 |
+
"meta-llama/Llama-2-7b-hf",
|
| 161 |
+
"codellama/CodeLlama-7b-hf",
|
| 162 |
+
"meta-llama/Meta-Llama-3-8B",
|
| 163 |
+
"EleutherAI/llemma_7b",
|
| 164 |
+
"allenai/OLMo-7B",
|
| 165 |
+
"google/gemma-7b-it",
|
| 166 |
+
"google/gemma-1.1-7b-it",
|
| 167 |
+
"mistralai/Mistral-7B-Instruct-v0.2",
|
| 168 |
+
"Qwen/Qwen1.5-7B-Chat",
|
| 169 |
+
"01-ai/Yi-6B-Chat",
|
| 170 |
+
"meta-llama/Llama-2-7b-chat-hf",
|
| 171 |
+
"codellama/CodeLlama-7b-Instruct-hf",
|
| 172 |
+
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 173 |
+
"allenai/OLMo-7B-SFT",
|
| 174 |
+
"allenai/OLMo-7B-Instruct",
|
| 175 |
+
"allenai/tulu-2-7b",
|
| 176 |
+
"allenai/tulu-2-dpo-7b",
|
| 177 |
+
"allenai/codetulu-2-7b",
|
| 178 |
+
"microsoft/Orca-2-7b",
|
| 179 |
+
"openchat/openchat-3.5-0106",
|
| 180 |
+
"teknium/OpenHermes-2-Mistral-7B",
|
| 181 |
+
"teknium/OpenHermes-2.5-Mistral-7B",
|
| 182 |
+
"NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
|
| 183 |
+
"Starling-LM-7B-alpha",
|
| 184 |
+
"Starling-LM-7B-beta",
|
| 185 |
+
"kaist-ai/mistral-orpo-alpha",
|
| 186 |
+
"kaist-ai/mistral-orpo-beta",
|
| 187 |
+
"HuggingFaceH4/zephyr-7b-beta",
|
| 188 |
+
"Qwen/Qwen1.5-14B",
|
| 189 |
+
"meta-llama/Llama-2-13b-hf",
|
| 190 |
+
"codellama/CodeLlama-13b-hf",
|
| 191 |
+
"upstage/SOLAR-10.7B-v1.0",
|
| 192 |
+
"Qwen/Qwen1.5-14B-Chat",
|
| 193 |
+
"upstage/SOLAR-10.7B-Instruct-v1.0",
|
| 194 |
+
"CohereForAI/aya-101",
|
| 195 |
+
"meta-llama/Llama-2-13b-chat-hf",
|
| 196 |
+
"codellama/CodeLlama-13b-Instruct-hf",
|
| 197 |
+
"allenai/tulu-2-13b",
|
| 198 |
+
"allenai/tulu-2-dpo-13b",
|
| 199 |
+
"allenai/codetulu-2-13b",
|
| 200 |
+
"microsoft/Orca-2-13b",
|
| 201 |
+
"01-ai/Yi-34B",
|
| 202 |
+
"EleutherAI/llemma_34b",
|
| 203 |
+
"Qwen/Qwen1.5-32B",
|
| 204 |
+
"codellama/CodeLlama-34b-hf",
|
| 205 |
+
"mistralai/Mixtral-8x7B-v0.1",
|
| 206 |
+
"01-ai/Yi-34B-Chat",
|
| 207 |
+
"NousResearch/Nous-Hermes-2-Yi-34B",
|
| 208 |
+
"codellama/CodeLlama-34b-Instruct-hf",
|
| 209 |
+
"allenai/codetulu-2-34b",
|
| 210 |
+
"Qwen/Qwen1.5-32B-Chat",
|
| 211 |
+
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 212 |
+
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
|
| 213 |
+
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
|
| 214 |
+
"CohereForAI/c4ai-command-r-v01",
|
| 215 |
+
"meta-llama/Llama-2-70b-hf",
|
| 216 |
+
"codellama/CodeLlama-70b-hf",
|
| 217 |
+
"mistral-community/Mixtral-8x22B-v0.1-AWQ",
|
| 218 |
+
"meta-llama/Meta-Llama-3-70B",
|
| 219 |
+
"Qwen/Qwen1.5-72B",
|
| 220 |
+
"meta-llama/Llama-2-70b-chat-hf",
|
| 221 |
+
"codellama/CodeLlama-70b-Instruct-hf",
|
| 222 |
+
"allenai/tulu-2-dpo-70b",
|
| 223 |
+
"alpindale/c4ai-command-r-plus-GPTQ",
|
| 224 |
+
"meta-llama/Meta-Llama-3-70B-Instruct",
|
| 225 |
+
"MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ",
|
| 226 |
+
"MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ",
|
| 227 |
+
"Qwen/Qwen1.5-72B-Chat",
|
| 228 |
+
"qwen/qwen-110b-chat",
|
| 229 |
+
"gpt-3.5-turbo-1106",
|
| 230 |
+
"gpt-3.5-turbo-0125",
|
| 231 |
+
"gpt-4-1106-preview",
|
| 232 |
+
"gpt-4-0125-preview",
|
| 233 |
+
"gpt-4-turbo-2024-04-09",
|
| 234 |
+
"gpt-4o-2024-05-13",
|
| 235 |
+
"mistral-medium",
|
| 236 |
+
"mistral-large",
|
| 237 |
+
"gemini-1.0-pro",
|
| 238 |
+
"gemini-pro-1.5",
|
| 239 |
+
"google/gemini-flash-1.5",
|
| 240 |
+
"claude-3-haiku-20240307",
|
| 241 |
+
"claude-3-sonnet-20240229",
|
| 242 |
+
"claude-3-opus-20240229",
|
| 243 |
+
]
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
bgb_trained_models = [
|
| 247 |
+
"microsoft/phi-1",
|
| 248 |
+
"microsoft/phi-1_5",
|
| 249 |
+
"microsoft/phi-2",
|
| 250 |
+
"Qwen/Qwen1.5-0.5B",
|
| 251 |
+
"Qwen/Qwen1.5-1.8B",
|
| 252 |
+
"Qwen/Qwen1.5-4B",
|
| 253 |
+
"google/gemma-2b",
|
| 254 |
+
"allenai/OLMo-1B",
|
| 255 |
+
"google/gemma-7b",
|
| 256 |
+
"mistralai/Mistral-7B-v0.1",
|
| 257 |
+
"Qwen/Qwen1.5-7B",
|
| 258 |
+
"01-ai/Yi-6B",
|
| 259 |
+
"meta-llama/Llama-2-7b-hf",
|
| 260 |
+
"codellama/CodeLlama-7b-hf",
|
| 261 |
+
"EleutherAI/llemma_7b",
|
| 262 |
+
"allenai/OLMo-7B",
|
| 263 |
+
"Qwen/Qwen1.5-14B",
|
| 264 |
+
"meta-llama/Llama-2-13b-hf",
|
| 265 |
+
"codellama/CodeLlama-13b-hf",
|
| 266 |
+
"upstage/SOLAR-10.7B-v1.0",
|
| 267 |
+
"01-ai/Yi-34B",
|
| 268 |
+
"EleutherAI/llemma_34b",
|
| 269 |
+
"codellama/CodeLlama-34b-hf",
|
| 270 |
+
"mistralai/Mixtral-8x7B-v0.1",
|
| 271 |
+
"meta-llama/Llama-2-70b-hf",
|
| 272 |
+
"codellama/CodeLlama-70b-hf",
|
| 273 |
+
"Qwen/Qwen1.5-72B",
|
| 274 |
+
"Qwen/Qwen1.5-0.5B-Chat",
|
| 275 |
+
"Qwen/Qwen1.5-1.8B-Chat",
|
| 276 |
+
"Qwen/Qwen1.5-4B-Chat",
|
| 277 |
+
"google/gemma-2b-it",
|
| 278 |
+
"google/gemma-7b-it",
|
| 279 |
+
"mistralai/Mistral-7B-Instruct-v0.2",
|
| 280 |
+
"Qwen/Qwen1.5-7B-Chat",
|
| 281 |
+
"01-ai/Yi-6B-Chat",
|
| 282 |
+
"meta-llama/Llama-2-7b-chat-hf",
|
| 283 |
+
"codellama/CodeLlama-7b-Instruct-hf",
|
| 284 |
+
"allenai/OLMo-7B-SFT",
|
| 285 |
+
"allenai/OLMo-7B-Instruct",
|
| 286 |
+
"allenai/tulu-2-7b",
|
| 287 |
+
"allenai/tulu-2-dpo-7b",
|
| 288 |
+
"allenai/codetulu-2-7b",
|
| 289 |
+
"microsoft/Orca-2-7b",
|
| 290 |
+
"openchat/openchat-3.5-0106",
|
| 291 |
+
"teknium/OpenHermes-2-Mistral-7B",
|
| 292 |
+
"teknium/OpenHermes-2.5-Mistral-7B",
|
| 293 |
+
"NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
|
| 294 |
+
"HuggingFaceH4/zephyr-7b-beta",
|
| 295 |
+
"Qwen/Qwen1.5-14B-Chat",
|
| 296 |
+
"meta-llama/Llama-2-13b-chat-hf",
|
| 297 |
+
"codellama/CodeLlama-13b-Instruct-hf",
|
| 298 |
+
"allenai/tulu-2-13b",
|
| 299 |
+
"allenai/tulu-2-dpo-13b",
|
| 300 |
+
"allenai/codetulu-2-13b",
|
| 301 |
+
"microsoft/Orca-2-13b",
|
| 302 |
+
"01-ai/Yi-34B-Chat",
|
| 303 |
+
"codellama/CodeLlama-34b-Instruct-hf",
|
| 304 |
+
"allenai/codetulu-2-34b",
|
| 305 |
+
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 306 |
+
"NousResearch/Nous-Hermes-2-Mistral-8x7B-SFT",
|
| 307 |
+
"NousResearch/Nous-Hermes-2-Mistral-8x7B-DPO",
|
| 308 |
+
"NousResearch/Nous-Hermes-2-Yi-34B",
|
| 309 |
+
"meta-llama/Llama-2-70b-chat-hf",
|
| 310 |
+
"codellama/CodeLlama-70b-Instruct-hf",
|
| 311 |
+
"Qwen/Qwen1.5-72B-Chat",
|
| 312 |
+
"allenai/tulu-2-dpo-72b",
|
| 313 |
+
]


MODEL_MAPPING = {
    "microsoft/phi-1": [1.3, "Base"],
    "microsoft/phi-1_5": [1.3, "Base"],
    "microsoft/phi-2": [2.7, "Base"],
    "Qwen/Qwen1.5-0.5B": [0.5, "Base"],
    "Qwen/Qwen1.5-1.8B": [1.8, "Base"],
    "Qwen/Qwen1.5-4B": [4.0, "Base"],
    "google/gemma-2b": [2.0, "Base"],
    "allenai/OLMo-1B": [1.0, "Base"],
    "Qwen/Qwen1.5-0.5B-Chat": [0.5, "Chat", "Qwen/Qwen1.5-0.5B"],
    "Qwen/Qwen1.5-1.8B-Chat": [1.8, "Chat", "Qwen/Qwen1.5-1.8B"],
    "Qwen/Qwen1.5-4B-Chat": [4.0, "Chat", "Qwen/Qwen1.5-4B"],
    "microsoft/Phi-3-mini-4k-instruct": [3.8, "Chat"],
    "microsoft/Phi-3-mini-128k-instruct": [3.8, "Chat"],
    "google/gemma-2b-it": [2.0, "Chat", "google/gemma-2b"],
    "google/gemma-1.1-2b-it": [2.0, "Chat"],
    "google/gemma-7b": [7.0, "Base"],
    "mistralai/Mistral-7B-v0.1": [7.0, "Base"],
    "mistral-community/Mistral-7B-v0.2": [7.0, "Base"],
    "Qwen/Qwen1.5-7B": [7.0, "Base"],
    "01-ai/Yi-6B": [6.0, "Base"],
    "meta-llama/Llama-2-7b-hf": [7.0, "Base"],
    "codellama/CodeLlama-7b-hf": [7.0, "Base"],
    "meta-llama/Meta-Llama-3-8B": [8.0, "Base"],
    "EleutherAI/llemma_7b": [7.0, "Base"],
    "allenai/OLMo-7B": [7.0, "Base"],
    "google/gemma-7b-it": [7.0, "Chat", "google/gemma-7b"],
    "google/gemma-1.1-7b-it": [7.0, "Chat"],
    "mistralai/Mistral-7B-Instruct-v0.2": [7.0, "Chat", "mistral-community/Mistral-7B-v0.2"],
    "Qwen/Qwen1.5-7B-Chat": [7.0, "Chat", "Qwen/Qwen1.5-7B"],
    "01-ai/Yi-6B-Chat": [6.0, "Chat", "01-ai/Yi-6B"],
    "meta-llama/Llama-2-7b-chat-hf": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
    "codellama/CodeLlama-7b-Instruct-hf": [7.0, "Chat", "codellama/CodeLlama-7b-hf"],
    "meta-llama/Meta-Llama-3-8B-Instruct": [8.0, "Chat", "meta-llama/Meta-Llama-3-8B"],
    "allenai/OLMo-7B-SFT": [7.0, "Chat", "allenai/OLMo-7B"],
    "allenai/OLMo-7B-Instruct": [7.0, "Chat", "allenai/OLMo-7B"],
    "allenai/tulu-2-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
    "allenai/tulu-2-dpo-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
    "allenai/codetulu-2-7b": [7.0, "Chat", "codellama/CodeLlama-7b-hf"],
    "microsoft/Orca-2-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
    "openchat/openchat-3.5-0106": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "teknium/OpenHermes-2-Mistral-7B": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "teknium/OpenHermes-2.5-Mistral-7B": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "NousResearch/Nous-Hermes-2-Mistral-7B-DPO": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "Starling-LM-7B-alpha": [7.0, "Chat"],
    "Starling-LM-7B-beta": [7.0, "Chat"],
    "kaist-ai/mistral-orpo-alpha": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "kaist-ai/mistral-orpo-beta": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "HuggingFaceH4/zephyr-7b-beta": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
    "Qwen/Qwen1.5-14B": [14.0, "Base"],
    "meta-llama/Llama-2-13b-hf": [13.0, "Base"],
    "codellama/CodeLlama-13b-hf": [13.0, "Base"],
    "upstage/SOLAR-10.7B-v1.0": [10.7, "Base"],
    "Qwen/Qwen1.5-14B-Chat": [14.0, "Chat", "Qwen/Qwen1.5-14B"],
    "upstage/SOLAR-10.7B-Instruct-v1.0": [10.7, "Chat", "upstage/SOLAR-10.7B-v1.0"],
    "CohereForAI/aya-101": [13.0, "Chat"],
    "meta-llama/Llama-2-13b-chat-hf": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
    "codellama/CodeLlama-13b-Instruct-hf": [13.0, "Chat", "codellama/CodeLlama-13b-hf"],
    "allenai/tulu-2-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
    "allenai/tulu-2-dpo-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
    "allenai/codetulu-2-13b": [13.0, "Chat", "codellama/CodeLlama-13b-hf"],
    "microsoft/Orca-2-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
    "01-ai/Yi-34B": [34.0, "Base"],
    "EleutherAI/llemma_34b": [34.0, "Base"],
    "Qwen/Qwen1.5-32B": [32.0, "Base"],
    "codellama/CodeLlama-34b-hf": [34.0, "Base"],
    "mistralai/Mixtral-8x7B-v0.1": [46.7, "Base"],
    "01-ai/Yi-34B-Chat": [34.0, "Chat", "01-ai/Yi-34B"],
    "NousResearch/Nous-Hermes-2-Yi-34B": [34.0, "Chat", "01-ai/Yi-34B"],
    "codellama/CodeLlama-34b-Instruct-hf": [34.0, "Chat", "codellama/CodeLlama-34b-hf"],
    "allenai/codetulu-2-34b": [34.0, "Chat", "codellama/CodeLlama-34b-hf"],
    "Qwen/Qwen1.5-32B-Chat": [32.0, "Chat", "Qwen/Qwen1.5-32B"],
    "mistralai/Mixtral-8x7B-Instruct-v0.1": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
    "CohereForAI/c4ai-command-r-v01": [35.0, "Chat"],
    "meta-llama/Llama-2-70b-hf": [70.0, "Base"],
    "codellama/CodeLlama-70b-hf": [70.0, "Base"],
    "mistral-community/Mixtral-8x22B-v0.1-AWQ": ["AWQ", "Base"],
    "meta-llama/Meta-Llama-3-70B": [70.0, "Base"],
    "Qwen/Qwen1.5-72B": [72.0, "Base"],
    "meta-llama/Llama-2-70b-chat-hf": [70.0, "Chat", "meta-llama/Llama-2-70b-hf"],
    "codellama/CodeLlama-70b-Instruct-hf": [70.0, "Chat", "codellama/CodeLlama-70b-hf"],
    "allenai/tulu-2-dpo-70b": [70.0, "Chat", "meta-llama/Llama-2-70b-hf"],
    "alpindale/c4ai-command-r-plus-GPTQ": ["GPTQ", "Chat"],
    "meta-llama/Meta-Llama-3-70B-Instruct": [70.0, "Chat", "meta-llama/Meta-Llama-3-70B"],
    "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ": ["AWQ", "Chat", "mistral-community/Mixtral-8x22B-v0.1-AWQ"],
    "MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ": ["AWQ", "Chat", "mistral-community/Mixtral-8x22B-v0.1-AWQ"],
    "Qwen/Qwen1.5-72B-Chat": [72.0, "Chat", "Qwen/Qwen1.5-72B"],
    "qwen/qwen-110b-chat": [110.0, "Chat", None],
    "gpt-3.5-turbo-1106": ["Proprietary", "Proprietary"],
    "gpt-3.5-turbo-0125": ["Proprietary", "Proprietary"],
    "gpt-4-1106-preview": ["Proprietary", "Proprietary"],
    "gpt-4-0125-preview": ["Proprietary", "Proprietary"],
    "gpt-4-turbo-2024-04-09": ["Proprietary", "Proprietary"],
    "gpt-4o-2024-05-13": ["Proprietary", "Proprietary"],
    "mistral-medium": ["Proprietary", "Proprietary"],
    "mistral-large": ["Proprietary", "Proprietary"],
    "gemini-1.0-pro": ["Proprietary", "Proprietary"],
    "gemini-pro-1.5": ["Proprietary", "Proprietary"],
    "google/gemini-flash-1.5": ["Proprietary", "Proprietary"],
    "claude-3-haiku-20240307": ["Proprietary", "Proprietary"],
    "claude-3-sonnet-20240229": ["Proprietary", "Proprietary"],
    "claude-3-opus-20240229": ["Proprietary", "Proprietary"],
}
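
# Illustrative reading of the mapping above (this helper is hypothetical, not part of
# the file): each value is [size in billions of params, or a quantization/"Proprietary"
# tag, model type, optional base checkpoint].
def _describe_model(name: str) -> str:
    size, model_type, *rest = MODEL_MAPPING[name]
    base = rest[0] if rest else None
    return f"{name}: {size} / {model_type}" + (f" (base: {base})" if base else "")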


MODEL_SHORT_TO_LONG = {model.split("/")[-1]: model for model in ORDERED_MODELS}


def get_model_type(model_name: str) -> str:
    for _, model_list in MODELS["pretrained"].items():
        if model_name in model_list:
            return "base"

    for _, model_list in MODELS["instruction_tuned"].items():
        if model_name in model_list:
            return "instruct"

    if model_name in API_MODELS:
        return "api"

    raise ValueError(f"Model {model_name} not found in model_list.py")
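
# Illustrative usage (return values assumed from the lookup tables defined earlier in
# this file):
# MODEL_SHORT_TO_LONG["tulu-2-13b"]       -> "allenai/tulu-2-13b"
# get_model_type("gpt-4o-2024-05-13")     -> "api"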


def get_open_model_list() -> list:
    all_models = []
    for _, model_list in MODELS["pretrained"].items():
        all_models.extend(model_list)

    for _, model_list in MODELS["instruction_tuned"].items():
        all_models.extend(model_list)

    return all_models


def get_all_model_list() -> list:
    all_models = []
    for _, model_list in MODELS["pretrained"].items():
        all_models.extend(model_list)

    for _, model_list in MODELS["instruction_tuned"].items():
        all_models.extend(model_list)

    all_models.extend(API_MODELS)

    return all_models


def get_pretrained_models() -> list:
    all_models = []
    for _, model_list in MODELS["pretrained"].items():
        all_models.extend(model_list)
    return all_models


def get_instruct_models() -> list:
    all_models = []
    for _, model_list in MODELS["instruction_tuned"].items():
        all_models.extend(model_list)
    return all_models


def get_model_params(model_name: str) -> int:
    for size_range, model_list in MODELS["pretrained"].items():
        if model_name in model_list:
            return int(size_range.split("B")[0].replace("<=", ""))

    for size_range, model_list in MODELS["instruction_tuned"].items():
        if model_name in model_list:
            return int(size_range.split("B")[0].replace("<=", ""))

    raise ValueError(f"Model {model_name} not found in model_list.py")


def get_model_num_gpus(model_name: str) -> int:
    model_params = get_model_params(model_name)
    num_gpus = {
        4: 1,
        7: 1,
        14: 2,
        50: 4,
        75: 8,
        175: 4,
    }[model_params]
    return num_gpus
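
# Illustrative only (assumes MODELS uses size-bucket keys of the form "<=7B"):
# get_model_params("meta-llama/Llama-2-7b-hf")         -> 7
# get_model_num_gpus("meta-llama/Llama-2-70b-chat-hf") -> 8   # if 70B sits in an "<=75B" bucket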


def get_not_trained_models() -> list:
    all_models = get_all_model_list()
    trained_models = bgb_trained_models
    not_trained_models = [model for model in all_models if model not in trained_models]
    return not_trained_models


def is_trained_model(model_name: str) -> bool:
    return model_name in bgb_trained_models


if __name__ == "__main__":
    assert get_model_type("microsoft/phi-1") == "base"
    assert get_model_params("microsoft/phi-2") == 4

    models = get_all_model_list()

    model_list_str = ""
    for model in models:
        model_list_str += f'"{model}"\n'
    print(model_list_str)

    print(f"{len(models)} models found in src/model_list.py")

    print(get_not_trained_models())
src/panel.py
ADDED
@@ -0,0 +1,60 @@
import gradio as gr

from src.leaderboard import BGB_COLUMN_MAPPING, get_bgb_leaderboard_df, get_leaderboard_df
from src.llm_perf import get_eval_df, get_llm_perf_df


def select_columns_fn(machine, columns, search, llm_perf_df=None):
    if llm_perf_df is None:
        llm_perf_df = get_llm_perf_df(machine=machine)

    selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
    selected_leaderboard_df = selected_leaderboard_df[
        selected_leaderboard_df["Model 🤗"].str.contains(search, case=False)
    ]
    selected_leaderboard_df = selected_leaderboard_df[columns]

    return selected_leaderboard_df


def select_columns_bgb_fn(machine, columns, search, type_checkboxes, param_slider, eval_df=None):
    if eval_df is None:
        eval_df = get_eval_df(machine)

    selected_leaderboard_df = get_bgb_leaderboard_df(eval_df)
    selected_leaderboard_df = selected_leaderboard_df[
        selected_leaderboard_df["Model 🤗"].str.contains(search, case=False)
    ]

    # NOTE: param_slider is received but not yet used to filter rows by parameter count;
    # type_checkboxes values are appended to the column selection below.
    columns = ["Model 🤗"] + columns + type_checkboxes

    return selected_leaderboard_df[columns]


def create_select_callback(
    # fixed
    machine_textbox,
    # interactive
    columns_checkboxes,
    search_bar,
    type_checkboxes,
    param_slider,
    # outputs
    leaderboard_table,
):
    columns_checkboxes.change(
        fn=select_columns_bgb_fn,
        inputs=[machine_textbox, columns_checkboxes, search_bar, type_checkboxes, param_slider],
        outputs=[leaderboard_table],
    )
    search_bar.change(
        fn=select_columns_bgb_fn,
        inputs=[machine_textbox, columns_checkboxes, search_bar, type_checkboxes, param_slider],
        outputs=[leaderboard_table],
    )
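
# A minimal wiring sketch (illustrative; component labels and choices are assumptions,
# the real ones live in app.py and src/leaderboard.py):
#
#   machine_textbox = gr.Textbox(value="prometheus-bgb-8x7b-v2.0", visible=False)
#   search_bar = gr.Textbox(label="Search 🔍")
#   columns_checkboxes = gr.CheckboxGroup(choices=[...], label="Columns")
#   type_checkboxes = gr.CheckboxGroup(choices=["Base", "Chat"], label="Type")
#   param_slider = gr.Slider(0, 180, label="Params (B)")
#   leaderboard_table = gr.Dataframe()
#   create_select_callback(machine_textbox, columns_checkboxes, search_bar,
#                          type_checkboxes, param_slider, leaderboard_table)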
src/utils.py
ADDED
@@ -0,0 +1,99 @@
from transformers import AutoConfig

LLM_MODEL_ARCHS = {
    "stablelm_epoch": "🔴 StableLM-Epoch",
    "stablelm_alpha": "🔴 StableLM-Alpha",
    "mixformer-sequential": "🧑‍💻 Phi φ",
    "RefinedWebModel": "🦅 Falcon",
    "gpt_bigcode": "⭐ StarCoder",
    "RefinedWeb": "🦅 Falcon",
    "baichuan": "🌊 Baichuan 百川",  # river
    "internlm": "🧑‍🎓 InternLM 书生",  # scholar
    "mistral": "⛵ Mistral",
    "mixtral": "⛵ Mixtral",
    "codegen": "♾️ CodeGen",
    "chatglm": "💬 ChatGLM",
    "falcon": "🦅 Falcon",
    "bloom": "🌸 Bloom",
    "llama": "🦙 LLaMA",
    "rwkv": "🐦‍⬛ RWKV",
    "deci": "🔵 deci",
    "Yi": "🫂 Yi 人",  # people
    "mpt": "🧱 MPT",
    # suggest something
    "gpt_neox": "GPT-NeoX",
    "gpt_neo": "GPT-Neo",
    "gpt2": "GPT-2",
    "gptj": "GPT-J",
    "bart": "BART",
}


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def process_architectures(model):
    try:
        config = AutoConfig.from_pretrained(model, trust_remote_code=True)
        return LLM_MODEL_ARCHS.get(config.model_type, "Unknown")
    except Exception:
        return "Unknown"


def process_score(score, quantization):
    if quantization != "Unquantized":
        return f"{score:.2f}*"
    else:
        return f"{score:.2f} "
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def process_quantizations(x):
|
| 53 |
+
if (
|
| 54 |
+
x["config.backend.quantization_scheme"] == "bnb"
|
| 55 |
+
and x["config.backend.quantization_config.load_in_4bit"] is True
|
| 56 |
+
):
|
| 57 |
+
return "BnB.4bit"
|
| 58 |
+
elif (
|
| 59 |
+
x["config.backend.quantization_scheme"] == "bnb"
|
| 60 |
+
and x["config.backend.quantization_config.load_in_8bit"] is True
|
| 61 |
+
):
|
| 62 |
+
return "BnB.8bit"
|
| 63 |
+
elif x["config.backend.quantization_scheme"] == "gptq" and x["config.backend.quantization_config.bits"] == 4:
|
| 64 |
+
return "GPTQ.4bit"
|
| 65 |
+
elif x["config.backend.quantization_scheme"] == "awq" and x["config.backend.quantization_config.bits"] == 4:
|
| 66 |
+
return "AWQ.4bit"
|
| 67 |
+
else:
|
| 68 |
+
return "Unquantized"
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def process_kernels(x):
|
| 72 |
+
if x["config.backend.quantization_scheme"] == "gptq" and x["config.backend.quantization_config.version"] == 1:
|
| 73 |
+
return "GPTQ.ExllamaV1"
|
| 74 |
+
|
| 75 |
+
elif x["config.backend.quantization_scheme"] == "gptq" and x["config.backend.quantization_config.version"] == 2:
|
| 76 |
+
return "GPTQ.ExllamaV2"
|
| 77 |
+
elif (
|
| 78 |
+
x["config.backend.quantization_scheme"] == "awq" and x["config.backend.quantization_config.version"] == "gemm"
|
| 79 |
+
):
|
| 80 |
+
return "AWQ.GEMM"
|
| 81 |
+
elif (
|
| 82 |
+
x["config.backend.quantization_scheme"] == "awq" and x["config.backend.quantization_config.version"] == "gemv"
|
| 83 |
+
):
|
| 84 |
+
return "AWQ.GEMV"
|
| 85 |
+
else:
|
| 86 |
+
return "No Kernel"


def test():
    # quick manual check: resolve one model's architecture label
    model = "Qwen/Qwen1.5-32B"
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    print(config.model_type, "->", LLM_MODEL_ARCHS.get(config.model_type, "Unknown"))


if __name__ == "__main__":
    test()