add intel CPU to leaderboard #31
opened by baptistecolle (HF staff)
- .gitignore +1 -4
- README.md +1 -59
- app.py +19 -47
- hardware.yaml +0 -50
- requirements.txt +2 -3
- src/content.py +6 -6
- src/dependency.py +0 -3
- src/hardware.py +0 -26
- src/kernels.py +1 -8
- src/llm_perf.py +15 -41
- src/panel.py +46 -91
- src/utils.py +0 -5
.gitignore
CHANGED
@@ -4,7 +4,4 @@ __pycache__/
 *ipynb
 .vscode/
 
-
-
-dataset/
-.venv
+dataset/
README.md
CHANGED
@@ -11,62 +11,4 @@ license: apache-2.0
 tags: [llm perf leaderboard, llm performance leaderboard, llm, performance, leaderboard]
 ---
 
-
-
-## 📖 About
-The 🤗 LLM-Perf Leaderboard 🏋️ is a leaderboard at the intersection of quality and performance.
-Its aim is to benchmark the performance (latency, throughput, memory & energy)
-of Large Language Models (LLMs) with different hardwares, backends and optimizations
-using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
-
-Anyone from the community can request a new base model or hardware/backend/optimization
-configuration for automated benchmarking:
-
-- Model evaluation requests should be made in the
-[🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
-we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
-- Hardware/Backend/Optimization configuration requests should be made in the
-[🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
-[Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
-
-## ⚙️ Details
-
-- To avoid communication-dependent results, only one GPU is used.
-- Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
-- LLMs are run on a singleton batch with a prompt size of 256, generating 64 tokens for at least 10 iterations and 10 seconds.
-- Energy consumption is measured in kWh using CodeCarbon, taking into consideration the GPU, CPU, RAM and location of the machine.
-- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two are reported by PyTorch and the last one is observed using PyNVML.
-
-All of our benchmarks are run by this single script,
-[benchmark_cuda_pytorch.py](https://github.com/huggingface/optimum-benchmark/blob/llm-perf/llm-perf/benchmark_cuda_pytorch.py),
-using the power of [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) to guarantee reproducibility and consistency.
-
-## 🚀 How to run locally
-
-To run the LLM-Perf Leaderboard locally on your machine, follow these steps:
-
-### 1. Clone the Repository
-
-First, clone the repository to your local machine:
-
-```bash
-git clone https://huggingface.co/spaces/optimum/llm-perf-leaderboard
-cd llm-perf-leaderboard
-```
-
-### 2. Install the Required Dependencies
-
-Install the necessary Python packages listed in the requirements.txt file:
-`pip install -r requirements.txt`
-
-### 3. Run the Application
-
-You can run the Gradio application in one of the following ways:
-- Option 1: Using Python
-`python app.py`
-- Option 2: Using Gradio CLI (includes hot-reload)
-`gradio app.py`
-
-### 4. Access the Application
-
-Once the application is running, you can access it locally in your web browser at http://127.0.0.1:7860/
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,11 +1,9 @@
 import gradio as gr
 
-import src.dependency  # noqa
 from src.assets import custom_css
 
 # from src.attention import create_attn_plots
 from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
-from src.hardware import load_hardware_configs
 from src.leaderboard import create_leaderboard_table
 from src.llm_perf import get_llm_perf_df
 from src.map import create_lat_score_mem_plot
@@ -15,31 +13,27 @@ from src.panel import (
     create_select_callback,
 )
 
-configs = load_hardware_configs("hardware.yaml")
+# from custom_kernels import create_quant_krnl_plots
 
+MACHINE_TO_HARDWARE = {
+    "1xA10": "A10-24GB-150W 🖥️",
+    "1xA100": "A100-80GB-275W 🖥️",
+    # "1xH100": "H100-80GB-700W 🖥️",
+}
 
-demo = gr.Blocks(
-    css=custom_css,
-    theme=gr.themes.Default(primary_hue="indigo", secondary_hue="indigo"),
-)
+
+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(LOGO, elem_classes="logo")
     gr.HTML(TITLE, elem_classes="title")
    ####################### HARDWARE TABS #######################
    with gr.Tabs(elem_classes="tabs"):
-        for id, config in enumerate(configs):
-            with gr.TabItem(config.description, id=id):
-                #######################
-                if config.detail:
-                    gr.Markdown(config.detail, elem_classes="descriptive-text")
-
-                # ####################### CONTROL PANEL #######################
+        for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
+            with gr.TabItem(hardware, id=id):
+                ####################### CONTROL PANEL #######################
                 (
                     filter_button,
-                    machine_value,
-                    subsets_value,
-                    backends_value,
-                    hardware_type_value,
+                    machine_textbox,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
@@ -47,33 +41,17 @@ with demo:
                     optimization_checkboxes,
                     quantization_checkboxes,
                     kernels_checkboxes,
-                ) = create_control_panel(
-                    machine=config.machine,
-                    subsets=config.subsets,
-                    backends=config.backends,
-                    hardware_type=config.hardware_type,
-                    hardware_provider=config.hardware_provider,
-                )
+                ) = create_control_panel(machine=machine)
                 ####################### HARDWARE SUBTABS #######################
                 with gr.Tabs(elem_classes="subtabs"):
-                    open_llm_perf_df = get_llm_perf_df(
-                        machine=config.machine,
-                        subsets=config.subsets,
-                        backends=config.backends,
-                        hardware_type=config.hardware_type,
-                    )
+                    open_llm_perf_df = get_llm_perf_df(machine=machine)
                    ####################### LEADERBOARD TAB #######################
                    with gr.TabItem("Leaderboard 🏅", id=0):
                        search_bar, columns_checkboxes, leaderboard_table = (
                            create_leaderboard_table(open_llm_perf_df)
                        )
-
-                    if (config.hardware_provider != "intel"
-                    ):  # TODO intel CPU does not measure the memory requirements correctly, so disable the graph feature until we fix the underlying issue
-                        with gr.TabItem("Find Your Best Model 🧭", id=1):
-                            lat_score_mem_plot = create_lat_score_mem_plot(
-                                open_llm_perf_df
-                            )
+                    with gr.TabItem("Find Your Best Model 🧭", id=1):
+                        lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)
                    ###################### ATTENTIONS SPEEDUP TAB #######################
                    # with gr.TabItem("Attention 📈", id=2):
                    #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
@@ -89,10 +67,7 @@ with demo:
        create_control_callback(
            filter_button,
            # inputs
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
+            machine_textbox,
            score_slider,
            memory_slider,
            backend_checkboxes,
@@ -114,10 +89,7 @@ with demo:
 
        create_select_callback(
            # inputs
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
+            machine_textbox,
            # interactive
            columns_checkboxes,
            search_bar,
@@ -126,7 +98,7 @@ with demo:
        )
 
    ####################### ABOUT TAB #######################
-    with gr.TabItem("About 📖", id=
+    with gr.TabItem("About 📖", id=3):
        gr.Markdown(ABOUT, elem_classes="descriptive-text")
    ####################### CITATION
    with gr.Row():
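Note that the new app.py threads the machine name into the Gradio callbacks through a hidden gr.Textbox instead of the old gr.State values. A minimal sketch of that pattern, with illustrative component names rather than the app's full wiring:

```python
import gradio as gr

def filter_rows(machine: str) -> str:
    # stand-in for filter_rows_fn: the hidden value arrives like any other input
    return f"Filtering results for {machine}"

with gr.Blocks() as demo:
    # fixed, invisible input carrying the per-tab machine name
    machine_textbox = gr.Textbox(value="1xA10", visible=False)
    filter_button = gr.Button("Filter")
    output = gr.Textbox(label="Result")
    # the hidden textbox is passed to the callback like any visible component
    filter_button.click(fn=filter_rows, inputs=[machine_textbox], outputs=[output])

# demo.launch()
```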
hardware.yaml
DELETED
@@ -1,50 +0,0 @@
-- machine: 1xA10
-  description: A10-24GB-150W 🖥️
-  hardware_provider: nvidia
-  hardware_type: cuda
-  subsets:
-    - unquantized
-    - awq
-    - bnb
-    - gptq
-  backends:
-    - pytorch
-
-- machine: 1xA100
-  description: A100-80GB-275W 🖥️
-  hardware_provider: nvidia
-  hardware_type: cuda
-  subsets:
-    - unquantized
-    - awq
-    - bnb
-    - gptq
-    - torchao
-  backends:
-    - pytorch
-
-- machine: 1xT4
-  description: T4-16GB-70W 🖥️
-  hardware_provider: nvidia
-  hardware_type: cuda
-  subsets:
-    - unquantized
-    - awq
-    - bnb
-    - gptq
-    - torchao
-  backends:
-    - pytorch
-
-- machine: 32vCPU-C7i
-  description: Intel-Xeon-SPR-385W 🖥️
-  detail: |
-    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
-  hardware_provider: intel
-  hardware_type: cpu
-  subsets:
-    - unquantized
-  backends:
-    - pytorch
-    - openvino
-    - onnxruntime
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
 huggingface_hub
 transformers
-gradio
+gradio
 plotly
-pandas
-ruff
+pandas
src/content.py
CHANGED
@@ -5,18 +5,18 @@ TITLE = """<h1 align="center" id="space-title">🤗 LLM-Perf Leaderboard 🏋️
 ABOUT = """
 ## 📖 About
 The 🤗 LLM-Perf Leaderboard 🏋️ is a leaderboard at the intersection of quality and performance.
-Its aim is to benchmark the performance (latency, throughput, memory & energy)
-of Large Language Models (LLMs) with different hardwares, backends and optimizations
+Its aim is to benchmark the performance (latency, throughput, memory & energy)
+of Large Language Models (LLMs) with different hardwares, backends and optimizations
 using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
 
-Anyone from the community can request a new base model or hardware/backend/optimization
+Anyone from the community can request a new base model or hardware/backend/optimization
 configuration for automated benchmarking:
 
-- Model evaluation requests should be made in the
+- Model evaluation requests should be made in the
 [🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
 we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
-- Hardware/Backend/Optimization configuration requests should be made in the
-[🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
+- Hardware/Backend/Optimization configuration requests should be made in the
+[🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
 [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
 
 ## ⚙️ Details
src/dependency.py
DELETED
@@ -1,3 +0,0 @@
-import os
-
-os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
src/hardware.py
DELETED
@@ -1,26 +0,0 @@
-from typing import Any, Dict, List, Optional
-
-import yaml
-
-
-class HardwareConfig:
-    def __init__(self, data: Dict[str, Any]):
-        self.machine: str = data["machine"]
-        self.description: str = data["description"]
-        self.hardware_provider: str = data["hardware_provider"]
-        self.hardware_type: str = data["hardware_type"]
-        self.subsets: List[str] = data["subsets"]
-        self.backends: List[str] = data["backends"]
-        self.detail: Optional[str] = data.get("detail", None)
-
-    def __repr__(self) -> str:
-        return (
-            f"HardwareConfig(machine='{self.machine}', description='{self.description}', "
-            f"hardware_provider={self.hardware_provider}, hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})"
-        )
-
-
-def load_hardware_configs(file_path: str) -> List[HardwareConfig]:
-    with open(file_path, "r") as file:
-        data = yaml.safe_load(file)
-    return [HardwareConfig(config) for config in data]
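For context, the deleted hardware.yaml above and this loader worked together. A minimal sketch of how app.py consumed them before this change (reconstructed from the deleted code, not part of the diff):

```python
from src.hardware import load_hardware_configs

# parse hardware.yaml into one HardwareConfig per machine entry
configs = load_hardware_configs("hardware.yaml")
for config in configs:
    # each entry carries machine, description, hardware_provider,
    # hardware_type, subsets, backends and an optional detail blurb
    print(config)  # uses HardwareConfig.__repr__
```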
src/kernels.py
CHANGED
@@ -38,7 +38,6 @@ def get_quant_df(llm_perf_df):
    exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
    gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
    gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
-    torchao_df = copy_df[(copy_df["Quantization 🗜️"] == "torchao.4bit")]
    # merge the three dataframes
    exllamav1_df = pd.merge(
        vanilla_df,
@@ -64,14 +63,8 @@ def get_quant_df(llm_perf_df):
        on=["Model 🤗"],
        suffixes=["", " Custom Kernel"],
    )
-    torchao_df = pd.merge(
-        vanilla_df,
-        torchao_df,
-        on=["Model 🤗"],
-        suffixes=["", " Custom Kernel"],
-    )
    # concat the two dataframes row-wise
-    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
+    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
    # compute speedups
    quant_df["Prefill Speedup (%)"] = (
        (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
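The merge-with-suffixes pattern that get_quant_df relies on pairs each model's vanilla run with its custom-kernel run, so speedups can be computed column against column. A toy sketch with made-up numbers:

```python
import pandas as pd

vanilla_df = pd.DataFrame({"Model 🤗": ["m1"], "Prefill (s)": [2.0]})
gemm_df = pd.DataFrame({"Model 🤗": ["m1"], "Prefill (s)": [1.0]})

# suffixes=["", " Custom Kernel"] keeps the vanilla column name unchanged
# and tags the kernel run's column
quant_df = pd.merge(vanilla_df, gemm_df, on=["Model 🤗"], suffixes=["", " Custom Kernel"])
quant_df["Prefill Speedup (%)"] = (
    quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]
) * 100  # -> 200.0, i.e. the kernel prefills twice as fast
```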
src/llm_perf.py
CHANGED
@@ -1,12 +1,9 @@
 import os
-from typing import List
 
 import pandas as pd
 
 from .utils import process_kernels, process_quantizations
 
-DATASET_DIRECTORY = "dataset"
-
 COLUMNS_MAPPING = {
    "config.name": "Experiment 🧪",
    "config.backend.model": "Model 🤗",
@@ -29,34 +26,21 @@ COLUMNS_MAPPING = {
    "#Params (B)": "Params (B)",
 }
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
+SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
 SORTING_ASCENDING = [False, True, False]
 
 
-def get_raw_llm_perf_df(
-    machine: str, subsets: List[str], backends: List[str], hardware_type: str
-):
+def get_raw_llm_perf_df(machine: str = "1xA10"):
    dfs = []
-    for subset in subsets:
-        for backend in backends:
-            try:
-                dfs.append(
-                    pd.read_csv(
-                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
-                    )
+    for subset in SUBSETS:
+        try:
+            dfs.append(
+                pd.read_csv(
+                    f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{subset}-{machine}.csv"
                )
-            except Exception:
-                print("Dataset not found:")
-                print(f"  • Backend: {backend}")
-                print(f"  • Subset: {subset}")
-                print(f"  • Machine: {machine}")
-                print(f"  • Hardware Type: {hardware_type}")
-                url = f"https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard/blob/main/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
-                print(f"  • URL: {url}")
-
-    if len(dfs) == 0:
-        raise ValueError(
-            f"No datasets found for machine {machine}, check your hardware.yaml config file or your dataset on huggingface"
-        )
+            )
+        except Exception:
+            print(f"Subset {subset} for machine {machine} not found")
 
    perf_df = pd.concat(dfs)
    llm_df = pd.read_csv(
@@ -124,22 +108,12 @@ def processed_llm_perf_df(llm_perf_df):
    return llm_perf_df
 
 
-def get_llm_perf_df(
-    machine: str, subsets: List[str], backends: List[str], hardware_type: str
-):
-    if not os.path.exists(DATASET_DIRECTORY):
-        os.makedirs(DATASET_DIRECTORY)
-
-    if os.path.exists(f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"):
-        llm_perf_df = pd.read_csv(
-            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"
-        )
+def get_llm_perf_df(machine: str = "1xA10"):
+    if os.path.exists(f"llm-perf-leaderboard-{machine}.csv"):
+        llm_perf_df = pd.read_csv(f"llm-perf-leaderboard-{machine}.csv")
    else:
-
-        llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
+        llm_perf_df = get_raw_llm_perf_df(machine)
        llm_perf_df = processed_llm_perf_df(llm_perf_df)
-        llm_perf_df.to_csv(
-            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
-        )
+        llm_perf_df.to_csv(f"llm-perf-leaderboard-{machine}.csv", index=False)
 
    return llm_perf_df
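The reworked get_llm_perf_df keeps the fetch-then-cache flow while dropping the dataset/ directory and the subsets/backends/hardware_type parameters. A condensed sketch of that flow under the new code's assumptions (the function name here is illustrative; the real function also concatenates all SUBSETS and post-processes columns, and pandas resolves hf:// paths through huggingface_hub's fsspec integration):

```python
import os
import pandas as pd

def get_cached_perf_df(machine: str = "1xA10") -> pd.DataFrame:
    cache = f"llm-perf-leaderboard-{machine}.csv"
    if os.path.exists(cache):
        return pd.read_csv(cache)  # reuse the local copy
    df = pd.read_csv(              # otherwise stream one subset from the Hub
        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-unquantized-{machine}.csv"
    )
    df.to_csv(cache, index=False)  # write the cache for next time
    return df
```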
src/panel.py
CHANGED
@@ -1,5 +1,3 @@
-from typing import List
-
 import gradio as gr
 
 from src.leaderboard import get_leaderboard_df
@@ -10,38 +8,9 @@ from src.llm_perf import get_llm_perf_df
 from src.map import get_lat_score_mem_fig
 
 
-def create_control_panel(
-    machine: str,
-    subsets: List[str],
-    backends: List[str],
-    hardware_provider: str,
-    hardware_type: str,
-):
+def create_control_panel(machine: str):
    # controls
-    machine_value = gr.State(value=machine)
-    subsets_value = gr.State(value=subsets)
-    backends_value = gr.State(value=backends)
-    hardware_type_value = gr.State(value=hardware_type)
-
-    if hardware_provider == "nvidia":
-        backends = ["pytorch"]
-        attention_implementations = ["Eager", "SDPA", "FAv2"]
-        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]
-        kernels = [
-            "No Kernel",
-            "GPTQ.ExllamaV1",
-            "GPTQ.ExllamaV2",
-            "AWQ.GEMM",
-            "AWQ.GEMV",
-        ]
-    elif hardware_provider == "intel":
-        backends = ["pytorch", "onnxruntime", "openvino"]
-        attention_implementations = ["Eager"]
-        quantizations = ["Unquantized"]
-        kernels = ["No Kernel"]
-    else:
-        raise ValueError(f"Unknown hardware provider: {hardware_provider}")
-
+    machine_textbox = gr.Textbox(value=machine, visible=False)
    with gr.Accordion("Control Panel 🎛️", open=False, elem_id="control-panel"):
        with gr.Row():
            with gr.Column(scale=2, variant="panel"):
@@ -63,8 +32,8 @@ def create_control_panel(
            with gr.Column(scale=1, variant="panel"):
                backend_checkboxes = gr.CheckboxGroup(
                    label="Backends 🏭",
-                    choices=backends,
-                    value=backends,
+                    choices=["pytorch"],
+                    value=["pytorch"],
                    info="☑️ Select the backends",
                    elem_id="backend-checkboxes",
                )
@@ -80,8 +49,8 @@ def create_control_panel(
            with gr.Column(scale=1, variant="panel"):
                optimization_checkboxes = gr.CheckboxGroup(
                    label="Attentions 👁️",
-                    choices=attention_implementations,
-                    value=attention_implementations,
+                    choices=["Eager", "SDPA", "FAv2"],
+                    value=["Eager", "SDPA", "FAv2"],
                    info="☑️ Select the optimization",
                    elem_id="optimization-checkboxes",
                )
@@ -89,8 +58,20 @@ def create_control_panel(
            with gr.Column(scale=1, variant="panel"):
                quantization_checkboxes = gr.CheckboxGroup(
                    label="Quantizations 🗜️",
-                    choices=quantizations,
-                    value=quantizations,
+                    choices=[
+                        "Unquantized",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "AWQ.4bit",
+                        "GPTQ.4bit",
+                    ],
+                    value=[
+                        "Unquantized",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "AWQ.4bit",
+                        "GPTQ.4bit",
+                    ],
                    info="☑️ Select the quantization schemes",
                    elem_id="quantization-checkboxes",
                    elem_classes="boxed-option",
@@ -98,8 +79,20 @@ def create_control_panel(
            with gr.Column(scale=1, variant="panel"):
                kernels_checkboxes = gr.CheckboxGroup(
                    label="Kernels ⚛️",
-                    choices=kernels,
-                    value=kernels,
+                    choices=[
+                        "No Kernel",
+                        "GPTQ.ExllamaV1",
+                        "GPTQ.ExllamaV2",
+                        "AWQ.GEMM",
+                        "AWQ.GEMV",
+                    ],
+                    value=[
+                        "No Kernel",
+                        "GPTQ.ExllamaV1",
+                        "GPTQ.ExllamaV2",
+                        "AWQ.GEMM",
+                        "AWQ.GEMV",
+                    ],
                    info="☑️ Select the custom kernels",
                    elem_id="kernel-checkboxes",
                    elem_classes="boxed-option",
@@ -113,10 +106,7 @@ def create_control_panel(
 
    return (
        filter_button,
-        machine_value,
-        backends_value,
-        hardware_type_value,
-        subsets_value,
+        machine_textbox,
        score_slider,
        memory_slider,
        backend_checkboxes,
@@ -129,13 +119,10 @@ def create_control_panel(
 
 def filter_rows_fn(
    machine,
-    subsets,
-    backends,
-    hardware_type,
    # inputs
    score,
    memory,
-
+    backends,
    precisions,
    attentions,
    quantizations,
@@ -144,14 +131,12 @@ def filter_rows_fn(
    columns,
    search,
 ):
-    llm_perf_df = get_llm_perf_df(
-        machine=machine, subsets=subsets, backends=backends, hardware_type=hardware_type
-    )
+    llm_perf_df = get_llm_perf_df(machine=machine)
    # print(attentions)
    # print(llm_perf_df["Attention 👁️"].unique())
    filtered_llm_perf_df = llm_perf_df[
        llm_perf_df["Model 🤗"].str.contains(search, case=False)
-        & llm_perf_df["Backend 🏭"].isin(backends)
+        & llm_perf_df["Backend 🏭"].isin(backends)
        & llm_perf_df["Precision 📥"].isin(precisions)
        & llm_perf_df["Attention 👁️"].isin(attentions)
        & llm_perf_df["Quantization 🗜️"].isin(quantizations)
@@ -160,7 +145,7 @@ def filter_rows_fn(
        & (llm_perf_df["Memory (MB)"] <= memory)
    ]
    selected_filtered_llm_perf_df = select_columns_fn(
-        machine, subsets, backends, hardware_type, columns, search, filtered_llm_perf_df
+        machine, columns, search, filtered_llm_perf_df
    )
    selected_filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_llm_perf_df)
    # filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
@@ -186,10 +171,7 @@ def create_control_callback(
    # button
    filter_button,
    # fixed
-    machine_value,
-    subsets_value,
-    backends_value,
-    hardware_type_value,
+    machine_textbox,
    # inputs
    score_slider,
    memory_slider,
@@ -215,10 +197,7 @@ def create_control_callback(
        fn=filter_rows_fn,
        inputs=[
            # fixed
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
+            machine_textbox,
            # inputs
            score_slider,
            memory_slider,
@@ -244,16 +223,9 @@ def create_control_callback(
    )
 
 
-def select_columns_fn(
-    machine, subsets, backends, hardware_type, columns, search, llm_perf_df=None
-):
+def select_columns_fn(machine, columns, search, llm_perf_df=None):
    if llm_perf_df is None:
-        llm_perf_df = get_llm_perf_df(
-            machine=machine,
-            subsets=subsets,
-            backends=backends,
-            hardware_type=hardware_type,
-        )
+        llm_perf_df = get_llm_perf_df(machine=machine)
 
    selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
    selected_leaderboard_df = selected_leaderboard_df[
@@ -266,10 +238,7 @@ def select_columns_fn(
 
 def create_select_callback(
    # fixed
-    machine_value,
-    subsets_value,
-    backends_value,
-    hardware_type_value,
+    machine_textbox,
    # interactive
    columns_checkboxes,
    search_bar,
@@ -278,25 +247,11 @@ def create_select_callback(
 ):
    columns_checkboxes.change(
        fn=select_columns_fn,
-        inputs=[
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
-            columns_checkboxes,
-            search_bar,
-        ],
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
        outputs=[leaderboard_table],
    )
    search_bar.change(
        fn=select_columns_fn,
-        inputs=[
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
-            columns_checkboxes,
-            search_bar,
-        ],
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
        outputs=[leaderboard_table],
    )
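filter_rows_fn builds one boolean mask by AND-ing a per-column condition for each control before indexing the dataframe. A self-contained sketch of that chaining on toy rows:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model 🤗": ["llama-7b", "mistral-7b"],
        "Backend 🏭": ["pytorch", "openvino"],
        "Memory (MB)": [8000.0, 4000.0],
    }
)
mask = (
    df["Model 🤗"].str.contains("llama", case=False)  # search bar
    & df["Backend 🏭"].isin(["pytorch"])              # backend checkboxes
    & (df["Memory (MB)"] <= 16000)                    # memory slider
)
print(df[mask])  # only rows passing every condition survive
```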
src/utils.py
CHANGED
@@ -70,11 +70,6 @@ def process_quantizations(x):
        and x["config.backend.quantization_config.bits"] == 4
    ):
        return "AWQ.4bit"
-    elif (
-        x["config.backend.quantization_scheme"] == "torchao"
-        and x["config.backend.quantization_config.quant_type"] == "int4_weight_only"
-    ):
-        return "torchao.4bit"
    else:
        return "Unquantized"
 
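process_quantizations maps each raw benchmark row to a display label, and the import in src/llm_perf.py suggests it is applied row-wise to derive the quantization column. A trimmed-down sketch under that assumption (only the AWQ branch and the fallback are shown; the wiring line is illustrative, not taken from the diff):

```python
import pandas as pd

def process_quantizations(x: pd.Series) -> str:
    # reduced version of the helper after this PR: the torchao branch is gone
    if (
        x["config.backend.quantization_scheme"] == "awq"
        and x["config.backend.quantization_config.bits"] == 4
    ):
        return "AWQ.4bit"
    else:
        return "Unquantized"

raw_df = pd.DataFrame(
    [{"config.backend.quantization_scheme": "awq",
      "config.backend.quantization_config.bits": 4}]
)
# row-wise application to build the display column
raw_df["Quantization 🗜️"] = raw_df.apply(process_quantizations, axis=1)
print(raw_df["Quantization 🗜️"].tolist())  # ['AWQ.4bit']
```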