Spaces:

optimum
/

llm-perf-leaderboard

Running

BenchmarkBot committed on Sep 3, 2023

Commit

d574374

•

1 Parent(s): 3c37eb3

sort by score

Files changed (2) hide show

app.py CHANGED Viewed

@@ -58,7 +58,8 @@ ALL_COLUMNS_DATATYPES = [
     #
     "markdown",
 ]
-SORTING_COLUMN = ["generate.throughput(tokens/s)"]
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
@@ -110,7 +111,7 @@ def get_benchmark_table(bench_df):
         axis=1,
     )
     # sort
-    copy_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
     # filter
     copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
     # rename

     #
     "markdown",
 ]
+SORTING_COLUMN = ["best_score"]
+SORTING_ASCENDING = [False]
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
         axis=1,
     )
     # sort
+    copy_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
     # filter
     copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
     # rename

src/assets/text_content.py CHANGED Viewed

@@ -11,10 +11,11 @@ Anyone from the community can request a model or a hardware/backend/optimization
 ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
-    <li>LLMs are evaluated on a singleton batch with a prompt size of 512 and generating 1000 tokens.</li>
-    <li>Peak memory is measured in MB during the generate pass with py3nvml while assuring the GPU's isolation.</li>
-    <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
 </ul>
 """

 ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
+    <li>LLMs are run on a singleton batch with a prompt size of 512, generating 1000 tokens.</li>
+    <li>Peak memory is measured in MB during the generate pass using Py3NVML while ensuring the GPU's isolation.</li>
+    <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
+    <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
 </ul>
 """