BenchmarkBot committed
Commit 67cbded • 1 Parent(s): bf0a261

made scores clickable

Files changed (3)
  1. app.py +9 -30
  2. src/assets/css_html_js.py +0 -36
  3. src/assets/text_content.py +7 -9
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import json
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -21,7 +20,7 @@ COLUMNS_MAPPING = {
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
 }
-COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "number"]
+COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
@@ -39,8 +38,8 @@ def get_benchmark_df(benchmark):
     scores_df = pd.read_csv(
         f"./llm-perf-dataset/reports/average_scores.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
-    # bench_df["average"] = bench_df["average"].apply(
-    #     make_clickable_score)
+    bench_df["average"] = bench_df["average"].apply(
+        make_clickable_score)
 
     # preprocess
     bench_df["model"] = bench_df["model"].apply(make_clickable_model)
@@ -54,33 +53,19 @@ def get_benchmark_df(benchmark):
     return bench_df
 
 
-# def change_tab(query_param):
-#     query_param = query_param.replace("'", '"')
-#     query_param = json.loads(query_param)
-
-#     if (
-#         isinstance(query_param, dict)
-#         and "tab" in query_param
-#         and query_param["tab"] == "evaluation"
-#     ):
-#         return gr.Tabs.update(selected=1)
-#     else:
-#         return gr.Tabs.update(selected=0)
-
-
 def submit_query(text, backends, datatypes, threshold, raw_df):
 
     # extract the average score (float) from the clickable score (clickable markdown)
-    # raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
-    #     extract_score_from_clickable)
+    raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
+        extract_score_from_clickable)
     filtered_df = raw_df[
         raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
         raw_df["Backend 🏭"].isin(backends) &
         raw_df["Datatype 📥"].isin(datatypes) &
         (raw_df["Average H4 Score ⬆️"] >= threshold)
     ]
-    # filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
-    #     make_clickable_score)
+    filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
+        make_clickable_score)
 
     return filtered_df
 
@@ -91,6 +76,7 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
+    # controls
     with gr.Row():
         search_bar = gr.Textbox(
             label="Model 🤗",
@@ -127,6 +113,7 @@ with demo:
             elem_id="submit-button",
         )
 
+    # leaderboard tabs
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
            gr.HTML(SINGLE_A100_TEXT)
@@ -166,14 +153,6 @@ with demo:
             elem_id="citation-button",
         ).style(show_copy_button=True)
 
-    # dummy = gr.Textbox(visible=False)
-    # demo.load(
-    #     change_tab,
-    #     dummy,
-    #     tabs,
-    #     _js=get_window_url_params,
-    # )
-
 # Restart space every hour
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600,
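
Note: the helpers referenced in this diff (make_clickable_model, make_clickable_score, extract_score_from_clickable) are imported from elsewhere in the Space and are not shown in this commit. Below is a minimal sketch of what the two score helpers might look like, assuming the score is wrapped in a markdown link; the link target and exact formatting are illustrative assumptions, not taken from the repository.

def make_clickable_score(score: float) -> str:
    # Wrap the raw float in a markdown link; with the column datatype set to
    # "markdown", gr.Dataframe renders the cell as a hyperlink instead of a number.
    url = "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"  # assumed target
    return f"[{score:.2f}]({url})"

def extract_score_from_clickable(clickable_score: str) -> float:
    # Recover the float from a string like "[12.34](https://...)" so it can be
    # compared against the threshold slider in submit_query.
    return float(clickable_score.split("](")[0].removeprefix("["))

This pairs with the COLUMNS_DATATYPES change above: switching the score column from "number" to "markdown" is what lets the linked score render in the table, while extract_score_from_clickable converts it back to a plain number before the threshold filter is applied and make_clickable_score re-wraps it afterwards.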
src/assets/css_html_js.py CHANGED
@@ -1,12 +1,4 @@
 custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
-
 .markdown-text {
     font-size: 16px !important;
 }
@@ -28,26 +20,11 @@ custom_css = """
     transform: scale(1.3);
 }
 
-#leaderboard-table {
-    margin-top: 15px
-}
-
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-
 #search-bar-table-box > div:first-child {
     background: none;
     border: none;
 }
 
-
-/* Hides the final AutoEvalColumn */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
-
 /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
 table td:first-child,
 table th:first-child {
@@ -59,19 +36,6 @@ table th:first-child {
 .tab-buttons button {
     font-size: 20px;
 }
-
-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-
-#scale-logo .download {
-    display: none;
-}
 """
 
 get_window_url_params = """
src/assets/text_content.py CHANGED
@@ -1,22 +1,20 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) on different hardwares and backends using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
-Anyone from the community can request a model or a hardware+backend configuration for automated benchmarking:
-- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ once they're publicly available.
-- Hardware+Backend requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions).
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
-[Config files](https://github.com/huggingface/optimum-benchmark/blob/main/examples/bert.yaml) (which can be used with Optimum-Benchmark) will be available soon for reproduction, questioning and correction of our results.
+Anyone from the community can request a model or a hardware+backend+optimization configuration for automated benchmarking:
+- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically once they're publicly available. That's mostly because we don't want to benchmark models that don't have an evaluation score yet.
+- Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
 """
 
-SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
+SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
 <ul>
 <li>Singleton Batch (1)</li>
 <li>Thousand Tokens (1000)</li>
 </ul>
 """
 
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
 CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
 author = {Ilyas Moutawwakil},
@@ -25,8 +23,8 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
 @software{optimum-benchmark,
-author = {Ilyas Moutawwakil},
+author = {Ilyas Moutawwakil},
 publisher = {Hugging Face},
-title = {A framework for benchmarking the performance of Transformers models on different hardwares and backends},
+title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
 }
 """