open_pl_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

Clémentine committed on Jun 22, 2023

Commit

6e8f400

1 Parent(s): d52179b

revamp

Browse files

Files changed (5) hide show

app.py +132 -235
src/assets/text_content.py +9 -26
src/elo_leaderboard/load_results.py +0 -200
src/elo_leaderboard/visualizations.py +0 -137
src/init.py +2 -28

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ from transformers import AutoConfig
 from src.auto_leaderboard.get_model_metadata import apply_metadata
 from src.assets.text_content import *
-from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
@@ -22,8 +21,6 @@ from src.init import load_all_info_from_hub
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
-HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
-GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False
@@ -37,7 +34,7 @@ def restart_space():
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
-auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -53,10 +50,6 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
-ELO_COLS = [c.name for c in fields(EloEvalColumn)]
-ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
-ELO_SORT_COL = EloEvalColumn.gpt4.name
 def has_no_nan_values(df, columns):
     return df[columns].notna().all(axis=1)
@@ -138,41 +131,6 @@ def get_evaluation_queue_df():
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
-def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
-    if human_eval_repo:
-        print("Pulling human_eval_repo changes")
-        human_eval_repo.git_pull()
-    all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
-    dataframe = pd.DataFrame.from_records(all_data)
-    dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
-    dataframe = dataframe[ELO_COLS]
-    return dataframe
-def get_elo_elements():
-    df_instruct = pd.read_json("human_evals/without_code.json")
-    df_code_instruct = pd.read_json("human_evals/with_code.json")
-    elo_leaderboard = get_elo_leaderboard(
-        df_instruct, df_code_instruct, tie_allowed=False
-    )
-    elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
-        df_instruct, df_code_instruct, tie_allowed=True
-    )
-    plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
-        df_instruct, df_code_instruct, tie_allowed=False
-    )
-    return (
-        elo_leaderboard,
-        elo_leaderboard_with_tie_allowed,
-        plot_1,
-        plot_2,
-        plot_3,
-        plot_4,
-    )
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
@@ -181,15 +139,6 @@ leaderboard_df = original_df.copy()
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df()
-(
-    elo_leaderboard,
-    elo_leaderboard_with_tie_allowed,
-    plot_1,
-    plot_2,
-    plot_3,
-    plot_4,
-) = get_elo_elements()
 def is_model_on_hub(model_name, revision) -> bool:
     try:
@@ -305,188 +254,153 @@ def change_tab(query_param):
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     with gr.Row():
-        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Row():
-        with gr.Column():
-            with gr.Accordion("📙 Citation", open=False):
-                citation_button = gr.Textbox(
-                    value=CITATION_BUTTON_TEXT,
-                    label=CITATION_BUTTON_LABEL,
-                    elem_id="citation-button",
-                ).style(show_copy_button=True)
-        with gr.Column():
-            with gr.Accordion("✨ CHANGELOG", open=False):
-                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Column():
-                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-                with gr.Box(elem_id="search-bar-table-box"):
-                    search_bar = gr.Textbox(
-                        placeholder="🔍 Search your model and press ENTER...",
-                        show_label=False,
-                        elem_id="search-bar",
-                    )
-                    with gr.Tabs(elem_classes="tab-buttons"):
-                        with gr.TabItem("Light View"):
-                            leaderboard_table_lite = gr.components.Dataframe(
-                                value=leaderboard_df[COLS_LITE],
-                                headers=COLS_LITE,
-                                datatype=TYPES_LITE,
-                                max_rows=None,
-                                elem_id="leaderboard-table-lite",
-                            )
-                        with gr.TabItem("Extended Model View"):
-                            leaderboard_table = gr.components.Dataframe(
-                                value=leaderboard_df,
-                                headers=COLS,
-                                datatype=TYPES,
-                                max_rows=None,
-                                elem_id="leaderboard-table",
-                            )
-                    # Dummy leaderboard for handling the case when the user uses backspace key
-                    hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                        value=original_df,
-                        headers=COLS,
-                        datatype=TYPES,
-                        max_rows=None,
-                        visible=False,
-                    )
-                    search_bar.submit(
-                        search_table,
-                        [hidden_leaderboard_table_for_search, search_bar],
-                        leaderboard_table,
-                    )
-                    # Dummy leaderboard for handling the case when the user uses backspace key
-                    hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
-                        value=original_df[COLS_LITE],
-                        headers=COLS_LITE,
-                        datatype=TYPES_LITE,
-                        max_rows=None,
-                        visible=False,
-                    )
-                    search_bar.submit(
-                        search_table,
-                        [hidden_leaderboard_table_for_search_lite, search_bar],
-                        leaderboard_table_lite,
-                    )
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Accordion("✅ Finished Evaluations", open=False):
-                    with gr.Row():
-                        finished_eval_table = gr.components.Dataframe(
-                            value=finished_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            max_rows=5,
-                        )
-                with gr.Accordion("🔄 Running Evaluation Queue", open=False):
-                    with gr.Row():
-                        running_eval_table = gr.components.Dataframe(
-                            value=running_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            max_rows=5,
-                        )
-                with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
-                    with gr.Row():
-                        pending_eval_table = gr.components.Dataframe(
-                            value=pending_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            max_rows=5,
-                        )
                 with gr.Row():
-                    refresh_button = gr.Button("Refresh")
-                    refresh_button.click(
-                        refresh,
-                        inputs=[],
-                        outputs=[
-                            leaderboard_table,
-                            finished_eval_table,
-                            running_eval_table,
-                            pending_eval_table,
-                        ],
                     )
-                with gr.Accordion("Submit a new model for evaluation"):
-                    with gr.Row():
-                        with gr.Column():
-                            model_name_textbox = gr.Textbox(label="Model name")
-                            revision_name_textbox = gr.Textbox(
-                                label="revision", placeholder="main"
-                            )
-                        with gr.Column():
-                            is_8bit_toggle = gr.Checkbox(
-                                False, label="8 bit eval", visible=not IS_PUBLIC
-                            )
-                            private = gr.Checkbox(
-                                False, label="Private", visible=not IS_PUBLIC
-                            )
-                            is_delta_weight = gr.Checkbox(False, label="Delta weights")
-                            base_model_name_textbox = gr.Textbox(
-                                label="base model (for delta)"
-                            )
-                    submit_button = gr.Button("Submit Eval")
-                    submission_result = gr.Markdown()
-                    submit_button.click(
-                        add_new_eval,
-                        [
-                            model_name_textbox,
-                            base_model_name_textbox,
-                            revision_name_textbox,
-                            is_8bit_toggle,
-                            private,
-                            is_delta_weight,
-                        ],
-                        submission_result,
                     )
-        with gr.TabItem(
-            "🧑‍⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table", id=1
-        ):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
-                with gr.Column(scale=1):
-                    gr.Image(
-                        "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
                     )
-            gr.Markdown("## No tie allowed")
-            elo_leaderboard_table = gr.components.Dataframe(
-                value=elo_leaderboard,
-                headers=ELO_COLS,
-                datatype=ELO_TYPES,
-                max_rows=5,
-            )
-            gr.Markdown("## Tie allowed*")
-            elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
-                value=elo_leaderboard_with_tie_allowed,
-                headers=ELO_COLS,
-                datatype=ELO_TYPES,
-                max_rows=5,
             )
-            gr.Markdown(
-                "\* Results when the scores of 4 and 5 were treated as ties.",
-                elem_classes="markdown-text",
-            )
-            gr.Markdown(
-                "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
-                elem_id="models-to-add-text",
             )
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,
@@ -494,23 +408,6 @@ with demo:
         tabs,
         _js=get_window_url_params,
     )
-    if ADD_PLOTS:
-        with gr.Box():
-            visualization_title = gr.HTML(VISUALIZATION_TITLE)
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
-                    plot_1 = gr.Plot(plot_1, show_label=False)
-                with gr.Column():
-                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
-                    plot_2 = gr.Plot(plot_2, show_label=False)
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
-                    plot_3 = gr.Plot(plot_3, show_label=False)
-                with gr.Column():
-                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
-                    plot_4 = gr.Plot(plot_4, show_label=False)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)

 from src.auto_leaderboard.get_model_metadata import apply_metadata
 from src.assets.text_content import *
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
+auto_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO)
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
 def has_no_nan_values(df, columns):
     return df[columns].notna().all(axis=1)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df()
 def is_model_on_hub(model_name, revision) -> bool:
     try:
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
+        with gr.Box(elem_id="search-bar-table-box"):
+            search_bar = gr.Textbox(
+                placeholder="🔍 Search your model and press ENTER...",
+                show_label=False,
+                elem_id="search-bar",
+            )
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark (lite)", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard_table_lite = gr.components.Dataframe(
+                value=leaderboard_df[COLS_LITE],
+                headers=COLS_LITE,
+                datatype=TYPES_LITE,
+                max_rows=None,
+                elem_id="leaderboard-table-lite",
+            )
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
+                value=original_df[COLS_LITE],
+                headers=COLS_LITE,
+                datatype=TYPES_LITE,
+                max_rows=None,
+                visible=False,
+            )
+            search_bar.submit(
+                search_table,
+                [hidden_leaderboard_table_for_search_lite, search_bar],
+                leaderboard_table_lite,
+            )
+        with gr.TabItem("📊 Extended view", elem_id="llm-benchmark-tab-table", id=1):
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df,
+                headers=COLS,
+                datatype=TYPES,
+                max_rows=None,
+                elem_id="leaderboard-table",
+            )
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df,
+                headers=COLS,
+                datatype=TYPES,
+                max_rows=None,
+                visible=False,
+            )
+            search_bar.submit(
+                search_table,
+                [hidden_leaderboard_table_for_search, search_bar],
+                leaderboard_table,
+            )
+        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+    with gr.Column():
+        with gr.Row():
+            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        with gr.Column():
+            with gr.Accordion("✅ Finished Evaluations", open=False):
                 with gr.Row():
+                    finished_eval_table = gr.components.Dataframe(
+                        value=finished_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        max_rows=5,
                     )
+            with gr.Accordion("🔄 Running Evaluation Queue", open=False):
+                with gr.Row():
+                    running_eval_table = gr.components.Dataframe(
+                        value=running_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        max_rows=5,
                     )
+            with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
+                with gr.Row():
+                    pending_eval_table = gr.components.Dataframe(
+                        value=pending_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        max_rows=5,
                     )
+        with gr.Row():
+            refresh_button = gr.Button("Refresh")
+            refresh_button.click(
+                refresh,
+                inputs=[],
+                outputs=[
+                    leaderboard_table,
+                    finished_eval_table,
+                    running_eval_table,
+                    pending_eval_table,
+                ],
             )
+        with gr.Accordion("Submit a new model for evaluation"):
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(
+                        label="revision", placeholder="main"
+                    )
+                with gr.Column():
+                    is_8bit_toggle = gr.Checkbox(
+                        False, label="8 bit eval", visible=not IS_PUBLIC
+                    )
+                    private = gr.Checkbox(
+                        False, label="Private", visible=not IS_PUBLIC
+                    )
+                    is_delta_weight = gr.Checkbox(False, label="Delta weights")
+                    base_model_name_textbox = gr.Textbox(
+                        label="base model (for delta)"
+                    )
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    is_8bit_toggle,
+                    private,
+                    is_delta_weight,
+                ],
+                submission_result,
             )
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("📙 Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    elem_id="citation-button",
+                ).style(show_copy_button=True)
+        with gr.Column():
+            with gr.Accordion("✨ CHANGELOG", open=False):
+                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,
         tabs,
         _js=get_window_url_params,
     )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)

src/assets/text_content.py CHANGED Viewed

@@ -57,15 +57,16 @@ CHANGELOG_TEXT = f"""
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
 INTRODUCTION_TEXT = f"""
-📐 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
-🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
-📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">  Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
 """
 LLM_BENCHMARKS_TEXT = f"""
-Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
@@ -74,17 +75,9 @@ Evaluation is performed against 4 popular benchmarks:
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 """
-HUMAN_GPT_EVAL_TEXT = f"""
-Evaluation is performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts. The prompts cover tasks such as brainstorming, creative generation, commonsense reasoning, open question answering, summarization, and code generation. Comparisons are made by humans and a model on a 1-8 Likert scale, where the labeler is required to choose a preference each time. Using these preferences, we create bootstrapped Elo rankings.
-We collaborated with **Scale AI** to generate the completions using a professional data labeling workforce on their platform, [following the labeling instructions found here](https://docs.google.com/document/d/1c5-96Lj-UH4lzKjLvJ_MRQaVMjtoEXTYA4dvoAYVCHc/edit?usp=sharing). To understand the evaluation of popular models, we also had GPT-4 label the completions using this prompt.
-For more information on the calibration and initiation of these measurements, please refer to the [announcement blog post](https://huggingface.co/blog/llm-leaderboard). We would like to express our gratitude to **LMSYS** for providing a [useful notebook](https://colab.research.google.com/drive/1lAQ9cKVErXI1rEYq7hTKNaCQ5Q8TzrI5?usp=sharing) for computing Elo estimates and plots.
-"""
 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -153,14 +146,4 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
       eprint={2109.07958},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
-}"""
-VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
-PLOT_1_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"
-PLOT_2_TITLE = "Comparison Count of Each Combination of Models (not allowing ties)"
-PLOT_3_TITLE = "Elo Estimates with error bars (ties allowed)"
-PLOT_4_TITLE = "Fraction of Model A Wins for All Non-tied A vs. B Comparisons"

 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
 INTRODUCTION_TEXT = f"""
+📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
+🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
 """
 LLM_BENCHMARKS_TEXT = f"""
+With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
+📈 We evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">  Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 """
 EVALUATION_QUEUE_TEXT = f"""
+# Evaluation Queue for the 🤗 Open LLM Leaderboard
+These models will be automatically evaluated on the 🤗 cluster.
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
       eprint={2109.07958},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
+}"""

src/elo_leaderboard/load_results.py DELETED Viewed

@@ -1,200 +0,0 @@
-from collections import defaultdict
-from dataclasses import dataclass
-from typing import Dict, List
-import numpy as np
-import pandas as pd
-from datasets import load_dataset
-from src.assets.text_content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
-from src.utils_display import make_clickable_model, EloEvalColumn
-from .visualizations import (
-    get_bootstrap_result,
-    switch_model_a_b,
-    visualize_battle_count,
-    visualize_bootstrap_scores,
-    visualize_pairwise_win_fraction,
-    visualize_rating_count,
-)
-@dataclass
-class EloEvalResult:
-    model: str
-    gpt_4_all: int
-    human_all: int
-    human_instruct: int
-    human_code_instruct: int
-    tie_allowed: bool
-    def to_dict(self):
-        base_model = f"{self.model}"
-        data_dict = {}
-        data_dict[EloEvalColumn.model.name] = make_clickable_model(base_model)
-        data_dict[EloEvalColumn.gpt4.name] = self.gpt_4_all
-        data_dict[EloEvalColumn.human_all.name] = self.human_all
-        data_dict[EloEvalColumn.human_instruct.name] = self.human_instruct
-        data_dict[EloEvalColumn.human_code_instruct.name] = self.human_code_instruct
-        return data_dict
-def create_eval_df(df, tie_allowed):
-    responses = []
-    for _, row in df.iterrows():
-        if row["status"] == "canceled":
-            continue
-        rating = row["response"]["annotations"]["Preference"]
-        if rating == "NaN":
-            continue
-        scores = row["response"]["responses"]
-        if any(s["Preference"] == "" for s in scores):
-            continue
-        response = {
-            "id": row["task_id"],
-            "prompt": row["params"]["templateVariables"]["prompt"],
-            "model_a": row["params"]["templateVariables"]["modela"],
-            "model_b": row["params"]["templateVariables"]["modelb"],
-            "response_a": row["params"]["templateVariables"]["response1"],
-            "response_b": row["params"]["templateVariables"]["response2"],
-            "rating": int(rating),
-            "ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)],
-        }
-        if tie_allowed:
-            response["win"] = (
-                "model_a"
-                if response["rating"] < 4
-                else "model_b"
-                if response["rating"] > 5
-                else "tie"
-            )
-        else:
-            response["win"] = "model_a" if response["rating"] < 5 else "model_b"
-        responses.append(response)
-    return pd.DataFrame(responses)
-def create_eval_df_for_gpt(df, tie_allowed):
-    responses = []
-    for _, row in df.iterrows():
-        response = {
-            "id": row["review_id"],
-            "prompt": row["question"],
-            "model_a": row["model1"],
-            "model_b": row["model2"],
-            "response_a": row["answer1"],
-            "response_b": row["answer2"],
-            "rating": row["score"][0],
-        }
-        if tie_allowed:
-            response["win"] = (
-                "model_a"
-                if response["rating"] < 4
-                else "model_b"
-                if response["rating"] > 5
-                else "tie"
-            )
-        else:
-            response["win"] = "model_a" if response["rating"] < 5 else "model_b"
-        responses.append(response)
-    return pd.DataFrame(responses)
-# Compute the Elo rating for each model
-def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000):
-    rating = defaultdict(lambda: initial_rating)
-    for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples():
-        ra = rating[model_a]
-        rb = rating[model_b]
-        ea = 1 / (1 + base ** ((rb - ra) / scale))
-        eb = 1 / (1 + base ** ((ra - rb) / scale))
-        if win == "model_a":
-            sa = 1
-        elif win == "model_b":
-            sa = 0
-        elif win == "tie" or win == "tie (bothbad)":
-            sa = 0.5
-        else:
-            raise Exception(f"unexpected vote {win}")
-        rating[model_a] += k * (sa - ea)
-        rating[model_b] += k * (1 - sa - eb)
-    return rating
-def convert_rating_from_float_to_int(df):
-    return {model: int(rating) for model, rating in compute_elo(df).items()}
-def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
-    df_all = pd.concat([df_instruct, df_code_instruct])
-    df_gpt_4 = load_dataset(
-        "gpt_4_evals/data/",
-        split="train",
-        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
-    ).to_pandas()
-    dfs = [df_instruct, df_code_instruct, df_all]
-    elo_ratings = [
-        convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
-        for df in dfs
-    ]
-    gpt_4_elo_ratings = convert_rating_from_float_to_int(
-        create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
-    )
-    elo_ratings.append(gpt_4_elo_ratings)
-    results = [
-        EloEvalResult(
-            model=model_name,
-            gpt_4_all=elo_ratings[3][model_name],
-            human_all=elo_ratings[2][model_name],
-            human_instruct=elo_ratings[0][model_name],
-            human_code_instruct=elo_ratings[1][model_name],
-            tie_allowed=tie_allowed,
-        )
-        for model_name in elo_ratings[0].keys()
-    ]
-    return results
-def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]:
-    eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed)
-    return [r.to_dict() for r in eval_results]
-def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
-    df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed)
-    df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed)
-    df_all = pd.concat([df_instruct, df_code_instruct])
-    game = df_all[["model_a", "model_b", "win"]]
-    game_switch = switch_model_a_b(game)
-    plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE)
-    plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE)
-    BOOTSTRAP_ROUNDS = 1000
-    if "bootstrap_elo_lu" not in globals():
-        bootstrap_elo_lu = get_bootstrap_result(
-            game_switch, compute_elo, BOOTSTRAP_ROUNDS
-        )
-    plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
-    plot_4 = visualize_rating_count(game, PLOT_4_TITLE)
-    return plot_1, plot_2, plot_3, plot_4

src/elo_leaderboard/visualizations.py DELETED Viewed

@@ -1,137 +0,0 @@
-import math
-import numpy as np
-import pandas as pd
-import plotly.express as px
-# 1
-def compute_pairwise_win_fraction(battles):
-    """Return the matrix of pairwise win fractions between models.
-
-    ``battles`` needs the columns ``model_a``, ``model_b`` and ``win``, where
-    ``win`` is ``"model_a"`` or ``"model_b"`` for decided battles. The entry at
-    (row, column) is the number of battles the row model won against the
-    column model, on either side, divided by the number of battles between the
-    two. Rows and columns are ordered by mean win fraction, best first.
-    """
-    def count_pairs(frame):
-        # Number of rows for every (model_a, model_b) combination.
-        return pd.pivot_table(frame, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
-
-    # Wins split by the side the winner was on.
-    wins_as_a = count_pairs(battles[battles["win"] == "model_a"])
-    wins_as_b = count_pairs(battles[battles["win"] == "model_b"])
-    # All battles per ordered (A, B) pair, regardless of outcome.
-    pair_counts = count_pairs(battles)
-    # Transposing the B-side table turns "B beat A" into "row beat column";
-    # adding the transposed pair counts makes the totals side-independent.
-    # NOTE(review): these operations align on labels, so a model absent from
-    # the index or columns of one table presumably yields NaN there - confirm.
-    total_wins = wins_as_a + wins_as_b.T
-    total_games = pair_counts + pair_counts.T
-    win_fraction = total_wins / total_games
-    # Order models by their proportion of wins, highest first.
-    ranking = win_fraction.mean(axis=1).sort_values(ascending=False)
-    ordered_models = list(ranking.keys())
-    return win_fraction.loc[ordered_models, ordered_models]
-def visualize_pairwise_win_fraction(battles, title):
-    """Return a heatmap figure of the pairwise win fractions in ``battles``.
-
-    The matrix comes from ``compute_pairwise_win_fraction``; rows are labelled
-    "Model A", columns "Model B", and ``title`` is used as the figure title.
-    """
-    win_fractions = compute_pairwise_win_fraction(battles)
-    heatmap = px.imshow(
-        win_fractions,
-        color_continuous_scale="RdBu",
-        text_auto=".2f",
-        title=title,
-    )
-    # Column labels go on top; the title is placed at the bottom centre.
-    layout_options = {
-        "xaxis_title": "Model B",
-        "yaxis_title": "Model A",
-        "xaxis_side": "top",
-        "title_y": 0.07,
-        "title_x": 0.5,
-    }
-    heatmap.update_layout(**layout_options)
-    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
-    heatmap.update_traces(hovertemplate=hover_text)
-    return heatmap
-# 2
-def switch_model_a_b(df):
-    """Return a copy of ``df`` with the A/B sides swapped on random rows.
-
-    Each row is swapped with probability 0.5 (one ``np.random.rand()`` draw per
-    row, in iteration order). On a swapped row ``model_a`` and ``model_b`` are
-    exchanged and a ``win`` of ``"model_a"``/``"model_b"`` is flipped; any
-    other ``win`` value is kept. ``df`` itself is not modified.
-    """
-    flipped_winner = {"model_a": "model_b", "model_b": "model_a"}
-    df_switch = df.copy()
-    for idx, battle in df.iterrows():
-        if np.random.rand() >= 0.5:
-            # Keep this battle as recorded.
-            continue
-        df_switch.at[idx, "model_a"] = battle["model_b"]
-        df_switch.at[idx, "model_b"] = battle["model_a"]
-        new_winner = flipped_winner.get(battle["win"])
-        if new_winner is not None:
-            df_switch.at[idx, "win"] = new_winner
-    return df_switch
-def visualize_battle_count(battles, title):
-    """Return a heatmap figure of how often each pair of models met.
-
-    Counts are symmetric: a battle counts for the pair whichever side each
-    model was on. Models are ordered by their total count, highest first.
-    """
-    pair_table = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
-    # Add the transpose so (A, B) and (B, A) battles count for the same pair.
-    symmetric_counts = pair_table + pair_table.T
-    totals = symmetric_counts.sum()
-    ordering = totals.sort_values(ascending=False).index
-    heatmap = px.imshow(
-        symmetric_counts.loc[ordering, ordering],
-        title=title,
-        text_auto=True,
-        width=600,
-    )
-    layout_options = {
-        "xaxis_title": "Model B",
-        "yaxis_title": "Model A",
-        "xaxis_side": "top",
-        "title_y": 0.07,
-        "title_x": 0.5,
-    }
-    heatmap.update_layout(**layout_options)
-    heatmap.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
-    return heatmap
-# 3
-def get_bootstrap_result(battles, func_compute_elo, num_round):
-    """Run ``func_compute_elo`` on ``num_round`` bootstrap resamples of ``battles``.
-
-    Each round draws a same-size sample of ``battles`` with replacement and
-    passes it to ``func_compute_elo``. The per-round results become the rows of
-    a DataFrame whose columns are ordered by median value, highest first.
-    """
-    per_round = []
-    for _ in range(num_round):
-        resampled = battles.sample(frac=1.0, replace=True)
-        per_round.append(func_compute_elo(resampled))
-    ratings = pd.DataFrame(per_round)
-    column_order = ratings.median().sort_values(ascending=False).index
-    return ratings[column_order]
-def visualize_bootstrap_scores(df, title):
-    """Return a scatter figure of per-model ratings with error bars.
-
-    For every column of ``df`` the median is plotted as the rating, with error
-    bars reaching to the 2.5% and 97.5% quantiles. Points are ordered by
-    rating, highest first, and labelled with the rating rounded to 2 decimals.
-    """
-    quantiles = {
-        "lower": df.quantile(0.025),
-        "rating": df.quantile(0.5),
-        "upper": df.quantile(0.975),
-    }
-    bars = pd.DataFrame(quantiles)
-    # Move the column labels of ``df`` out of the index into a "model" column.
-    bars = bars.reset_index(names="model")
-    bars = bars.sort_values("rating", ascending=False)
-    # Error bar lengths are measured from the median.
-    bars["error_y"] = bars["upper"] - bars["rating"]
-    bars["error_y_minus"] = bars["rating"] - bars["lower"]
-    bars["rating_rounded"] = np.round(bars["rating"], 2)
-    scatter = px.scatter(
-        bars,
-        x="model",
-        y="rating",
-        error_y="error_y",
-        error_y_minus="error_y_minus",
-        text="rating_rounded",
-        title=title,
-    )
-    scatter.update_layout(xaxis_title="Model", yaxis_title="Rating")
-    return scatter
-# 4
-def visualize_rating_count(df, title):
-    """Return a bar figure of how many battles each model took part in.
-
-    A model is counted once for every row in which it appears as ``model_a``
-    or ``model_b``. The y-axis is limited to the enclosing multiples of 100
-    around the smallest and largest count.
-
-    Side effect: the figure is also written as an HTML fragment to
-    ``src/assets/model_counts.html``.
-    """
-    df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
-    fig = px.bar(df_all_value_counts, title=title, text_auto=True)
-    min_y = df_all_value_counts.min()
-    max_y = df_all_value_counts.max()
-    # Round the lower bound down from the minimum and the upper bound up from
-    # the maximum. Deriving them the other way round (start from the maximum,
-    # end from the minimum) only gives a usable range when every count lies
-    # within the same hundred, and an inverted axis otherwise.
-    y_begin = math.floor(min_y / 100) * 100
-    y_end = math.ceil(max_y / 100) * 100
-    fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
-    fig.update_yaxes(range=[y_begin, y_end])
-    # save the plot for the blog:
-    fig.write_html("src/assets/model_counts.html", full_html=False, include_plotlyjs="cdn")
-    return fig

src/init.py CHANGED Viewed

@@ -15,15 +15,11 @@ def get_all_requested_models(requested_models_dir):
     return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
-def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
     auto_eval_repo = None
     requested_models = None
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")
-        # try:
-        #     shutil.rmtree("./auto_evals/")
-        # except:
-        #     pass
         auto_eval_repo = Repository(
             local_dir="./auto_evals/",
@@ -36,29 +32,7 @@ def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
         requested_models_dir = "./auto_evals/eval_requests"
         requested_models = get_all_requested_models(requested_models_dir)
-    human_eval_repo = None
-    if H4_TOKEN and not os.path.isdir("./human_evals"):
-        print("Pulling human evaluation repo")
-        human_eval_repo = Repository(
-            local_dir="./human_evals/",
-            clone_from=HUMAN_EVAL_REPO,
-            use_auth_token=H4_TOKEN,
-            repo_type="dataset",
-        )
-        human_eval_repo.git_pull()
-    gpt_4_eval_repo = None
-    if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
-        print("Pulling GPT-4 evaluation repo")
-        gpt_4_eval_repo = Repository(
-            local_dir="./gpt_4_evals/",
-            clone_from=GPT_4_EVAL_REPO,
-            use_auth_token=H4_TOKEN,
-            repo_type="dataset",
-        )
-        gpt_4_eval_repo.git_pull()
-    return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models
 #def load_results(model, benchmark, metric):

     return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
+def load_all_info_from_hub(LMEH_REPO):
     auto_eval_repo = None
     requested_models = None
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")
         auto_eval_repo = Repository(
             local_dir="./auto_evals/",
         requested_models_dir = "./auto_evals/eval_requests"
         requested_models = get_all_requested_models(requested_models_dir)
+    return auto_eval_repo, requested_models
 #def load_results(model, benchmark, metric):