Add Intel CPU to leaderboard

#31 opened by baptistecolle
Files changed (12)
  1. .gitignore +1 -4
  2. README.md +1 -59
  3. app.py +19 -47
  4. hardware.yaml +0 -50
  5. requirements.txt +2 -3
  6. src/content.py +6 -6
  7. src/dependency.py +0 -3
  8. src/hardware.py +0 -26
  9. src/kernels.py +1 -8
  10. src/llm_perf.py +15 -41
  11. src/panel.py +46 -91
  12. src/utils.py +0 -5
.gitignore CHANGED
```diff
@@ -4,7 +4,4 @@ __pycache__/
 *ipynb
 .vscode/
 
-work-in-progress/
-
-dataset/
-.venv
+dataset/
```
 
 
 
README.md CHANGED
```diff
@@ -11,62 +11,4 @@ license: apache-2.0
 tags: [llm perf leaderboard, llm performance leaderboard, llm, performance, leaderboard]
 ---
 
-# LLM-perf leaderboard
-
-## πŸ“ About
-The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ is a laderboard at the intersection of quality and performance.
-Its aim is to benchmark the performance (latency, throughput, memory & energy)
-of Large Language Models (LLMs) with different hardwares, backends and optimizations
-using [Optimum-Benhcmark](https://github.com/huggingface/optimum-benchmark).
-
-Anyone from the community can request a new base model or hardware/backend/optimization
-configuration for automated benchmarking:
-
-- Model evaluation requests should be made in the
-[πŸ€— Open LLM Leaderboard πŸ…](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
-we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
-- Hardware/Backend/Optimization configuration requests should be made in the
-[πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
-[Optimum-Benhcmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
-
-## ✍️ Details
-
-- To avoid communication-dependent results, only one GPU is used.
-- Score is the average evaluation score obtained from the [πŸ€— Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
-- LLMs are running on a singleton batch with a prompt size of 256 and generating a 64 tokens for at least 10 iterations and 10 seconds.
-- Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
-- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.
-
-All of our benchmarks are ran by this single script
-[benchmark_cuda_pytorch.py](https://github.com/huggingface/optimum-benchmark/blob/llm-perf/llm-perf/benchmark_cuda_pytorch.py)
-using the power of [Optimum-Benhcmark](https://github.com/huggingface/optimum-benchmark) to garantee reproducibility and consistency.
-
-## πŸƒ How to run locally
-
-To run the LLM-Perf Leaderboard locally on your machine, follow these steps:
-
-### 1. Clone the Repository
-
-First, clone the repository to your local machine:
-
-```bash
-git clone https://huggingface.co/spaces/optimum/llm-perf-leaderboard
-cd llm-perf-leaderboard
-```
-
-### 2. Install the Required Dependencies
-
-Install the necessary Python packages listed in the requirements.txt file:
-`pip install -r requirements.txt`
-
-### 3. Run the Application
-
-You can run the Gradio application in one of the following ways:
-- Option 1: Using Python
-`python app.py`
-- Option 2: Using Gradio CLI (include hot-reload)
-`gradio app.py`
-
-### 4. Access the Application
-
-Once the application is running, you can access it locally in your web browser at http://127.0.0.1:7860/
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
 
app.py CHANGED
```diff
@@ -1,11 +1,9 @@
 import gradio as gr
 
-import src.dependency  # noqa
 from src.assets import custom_css
 
 # from src.attention import create_attn_plots
 from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
-from src.hardware import load_hardware_configs
 from src.leaderboard import create_leaderboard_table
 from src.llm_perf import get_llm_perf_df
 from src.map import create_lat_score_mem_plot
@@ -15,31 +13,27 @@ from src.panel import (
     create_select_callback,
 )
 
-configs = load_hardware_configs("hardware.yaml")
+# from custom_kernels import create_quant_krnl_plots
 
+MACHINE_TO_HARDWARE = {
+    "1xA10": "A10-24GB-150W πŸ–₯️",
+    "1xA100": "A100-80GB-275W πŸ–₯️",
+    # "1xH100": "H100-80GB-700W πŸ–₯️",
+}
 
-demo = gr.Blocks(
-    css=custom_css,
-    theme=gr.themes.Default(primary_hue="indigo", secondary_hue="indigo"),
-)
+
+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(LOGO, elem_classes="logo")
     gr.HTML(TITLE, elem_classes="title")
     ####################### HARDWARE TABS #######################
     with gr.Tabs(elem_classes="tabs"):
-        for id, config in enumerate(configs):
-            with gr.TabItem(config.description, id=id):
-                ####################### HARDWARE DETAILS #######################
-                if config.detail:
-                    gr.Markdown(config.detail, elem_classes="descriptive-text")
-
-                # ####################### CONTROL PANEL #######################
+        for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
+            with gr.TabItem(hardware, id=id):
+                ####################### CONTROL PANEL #######################
                 (
                     filter_button,
-                    machine_value,
-                    subsets_value,
-                    backends_value,
-                    hardware_type_value,
+                    machine_textbox,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
@@ -47,33 +41,17 @@ with demo:
                     optimization_checkboxes,
                     quantization_checkboxes,
                     kernels_checkboxes,
-                ) = create_control_panel(
-                    machine=config.machine,
-                    subsets=config.subsets,
-                    backends=config.backends,
-                    hardware_type=config.hardware_type,
-                    hardware_provider=config.hardware_provider,
-                )
+                ) = create_control_panel(machine=machine)
                 ####################### HARDWARE SUBTABS #######################
                 with gr.Tabs(elem_classes="subtabs"):
-                    open_llm_perf_df = get_llm_perf_df(
-                        machine=config.machine,
-                        subsets=config.subsets,
-                        backends=config.backends,
-                        hardware_type=config.hardware_type,
-                    )
+                    open_llm_perf_df = get_llm_perf_df(machine=machine)
                     ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard πŸ…", id=0):
                         search_bar, columns_checkboxes, leaderboard_table = (
                             create_leaderboard_table(open_llm_perf_df)
                         )
-                    if (
-                        config.hardware_provider != "intel"
-                    ):  # TODO intel CPU does not measure the memory requirements correctly, so disable the graph feature until we fix the underlying issue
-                        with gr.TabItem("Find Your Best Model 🧭", id=1):
-                            lat_score_mem_plot = create_lat_score_mem_plot(
-                                open_llm_perf_df
-                            )
+                    with gr.TabItem("Find Your Best Model 🧭", id=1):
+                        lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)
                     ###################### ATTENTIONS SPEEDUP TAB #######################
                     # with gr.TabItem("Attention πŸ“ˆ", id=2):
                     #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
@@ -89,10 +67,7 @@ with demo:
                 create_control_callback(
                     filter_button,
                     # inputs
-                    machine_value,
-                    subsets_value,
-                    backends_value,
-                    hardware_type_value,
+                    machine_textbox,
                     score_slider,
                     memory_slider,
                     backend_checkboxes,
@@ -114,10 +89,7 @@ with demo:
 
                 create_select_callback(
                     # inputs
-                    machine_value,
-                    subsets_value,
-                    backends_value,
-                    hardware_type_value,
+                    machine_textbox,
                     # interactive
                     columns_checkboxes,
                     search_bar,
@@ -126,7 +98,7 @@ with demo:
                 )
 
         ####################### ABOUT TAB #######################
-        with gr.TabItem("About πŸ“–", id=len(configs)):
+        with gr.TabItem("About πŸ“–", id=3):
             gr.Markdown(ABOUT, elem_classes="descriptive-text")
     ####################### CITATION
     with gr.Row():
```
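For readers less familiar with Gradio's Blocks API, the per-machine tab layout that both sides of this diff build boils down to the following minimal sketch; the hardware dict and the Markdown body are illustrative placeholders, not the real control panel or leaderboard helpers.

```python
import gradio as gr

# Illustrative subset of the MACHINE_TO_HARDWARE mapping used on the right-hand side.
MACHINE_TO_HARDWARE = {
    "1xA10": "A10-24GB-150W πŸ–₯️",
    "1xA100": "A100-80GB-275W πŸ–₯️",
}

with gr.Blocks() as demo:
    with gr.Tabs():
        for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
            with gr.TabItem(hardware, id=id):
                # The real app instantiates create_control_panel(machine=machine)
                # and the leaderboard table for this machine here.
                gr.Markdown(f"Benchmark results for `{machine}`")

if __name__ == "__main__":
    demo.launch()
```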
hardware.yaml DELETED
@@ -1,50 +0,0 @@
1
- - machine: 1xA10
2
- description: A10-24GB-150W πŸ–₯️
3
- hardware_provider: nvidia
4
- hardware_type: cuda
5
- subsets:
6
- - unquantized
7
- - awq
8
- - bnb
9
- - gptq
10
- backends:
11
- - pytorch
12
-
13
- - machine: 1xA100
14
- description: A100-80GB-275W πŸ–₯️
15
- hardware_provider: nvidia
16
- hardware_type: cuda
17
- subsets:
18
- - unquantized
19
- - awq
20
- - bnb
21
- - gptq
22
- - torchao
23
- backends:
24
- - pytorch
25
-
26
- - machine: 1xT4
27
- description: T4-16GB-70W πŸ–₯️
28
- hardware_provider: nvidia
29
- hardware_type: cuda
30
- subsets:
31
- - unquantized
32
- - awq
33
- - bnb
34
- - gptq
35
- - torchao
36
- backends:
37
- - pytorch
38
-
39
- - machine: 32vCPU-C7i
40
- description: Intel-Xeon-SPR-385W πŸ–₯️
41
- detail: |
42
- We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
43
- hardware_provider: intel
44
- hardware_type: cpu
45
- subsets:
46
- - unquantized
47
- backends:
48
- - pytorch
49
- - openvino
50
- - onnxruntime
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
```diff
@@ -1,6 +1,5 @@
 huggingface_hub
 transformers
-gradio>=5.0.0
+gradio
 plotly
-pandas
-ruff
+pandas
```
 
src/content.py CHANGED
```diff
@@ -5,18 +5,18 @@ TITLE = """<h1 align="center" id="space-title">πŸ€— LLM-Perf Leaderboard πŸ‹οΈ
 ABOUT = """
 ## πŸ“ About
 The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ is a laderboard at the intersection of quality and performance.
-Its aim is to benchmark the performance (latency, throughput, memory & energy)
-of Large Language Models (LLMs) with different hardwares, backends and optimizations
+Its aim is to benchmark the performance (latency, throughput, memory & energy)
+of Large Language Models (LLMs) with different hardwares, backends and optimizations
 using [Optimum-Benhcmark](https://github.com/huggingface/optimum-benchmark).
 
-Anyone from the community can request a new base model or hardware/backend/optimization
+Anyone from the community can request a new base model or hardware/backend/optimization
 configuration for automated benchmarking:
 
-- Model evaluation requests should be made in the
+- Model evaluation requests should be made in the
 [πŸ€— Open LLM Leaderboard πŸ…](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
 we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
-- Hardware/Backend/Optimization configuration requests should be made in the
-[πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
+- Hardware/Backend/Optimization configuration requests should be made in the
+[πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
 [Optimum-Benhcmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
 
 ## ✍️ Details
```
src/dependency.py DELETED
```diff
@@ -1,3 +0,0 @@
-import os
-
-os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
```
 
 
 
 
src/hardware.py DELETED
```diff
@@ -1,26 +0,0 @@
-from typing import Any, Dict, List, Optional
-
-import yaml
-
-
-class HardwareConfig:
-    def __init__(self, data: Dict[str, Any]):
-        self.machine: str = data["machine"]
-        self.description: str = data["description"]
-        self.hardware_provider: str = data["hardware_provider"]
-        self.hardware_type: str = data["hardware_type"]
-        self.subsets: List[str] = data["subsets"]
-        self.backends: List[str] = data["backends"]
-        self.detail: Optional[str] = data.get("detail", None)
-
-    def __repr__(self) -> str:
-        return (
-            f"HardwareConfig(machine='{self.machine}', description='{self.description}', "
-            f"hardware_provider={self.hardware_provider}, hardware_type={self.hardware_type}, subsets={self.subsets}, backends={self.backends})"
-        )
-
-
-def load_hardware_configs(file_path: str) -> List[HardwareConfig]:
-    with open(file_path, "r") as file:
-        data = yaml.safe_load(file)
-    return [HardwareConfig(config) for config in data]
```
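For context, this removed loader and the removed `hardware.yaml` above formed a small config-driven registry; a hedged usage sketch of that old flow (attribute names are the ones defined in `HardwareConfig` above):

```python
# Usage sketch of the removed config-driven flow: parse hardware.yaml into
# HardwareConfig objects and iterate over them, as the old app.py did to build one
# tab per benchmarked machine.
from src.hardware import load_hardware_configs

configs = load_hardware_configs("hardware.yaml")
for config in configs:
    # e.g. "1xA10 (nvidia/cuda): subsets=['unquantized', 'awq', 'bnb', 'gptq'], backends=['pytorch']"
    print(
        f"{config.machine} ({config.hardware_provider}/{config.hardware_type}): "
        f"subsets={config.subsets}, backends={config.backends}"
    )
```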
 
src/kernels.py CHANGED
```diff
@@ -38,7 +38,6 @@ def get_quant_df(llm_perf_df):
     exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
     gemm_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "AWQ.4bit+GEMM")]
     gemv_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "AWQ.4bit+GEMV")]
-    torchao_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "torchao.4bit")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
         vanilla_df,
@@ -64,14 +63,8 @@ def get_quant_df(llm_perf_df):
         on=["Model πŸ€—"],
         suffixes=["", " Custom Kernel"],
     )
-    torchao_df = pd.merge(
-        vanilla_df,
-        torchao_df,
-        on=["Model πŸ€—"],
-        suffixes=["", " Custom Kernel"],
-    )
     # concat the two dataframes row-wise
-    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
+    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
     quant_df["Prefill Speedup (%)"] = (
         (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
```
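The merge-and-speedup pattern touched by this hunk is easiest to see on toy data; a minimal sketch (frames and numbers are made up, column names follow the diff):

```python
import pandas as pd

# Toy frames standing in for the unquantized baseline and one quantized/kernel subset.
vanilla_df = pd.DataFrame({"Model πŸ€—": ["model-a"], "Prefill (s)": [2.0]})
gemm_df = pd.DataFrame({"Model πŸ€—": ["model-a"], "Prefill (s)": [1.0]})

# Align each kernel run with its baseline on the model name, then express the baseline
# prefill latency as a percentage of the kernel run (200% means the kernel is 2x faster).
quant_df = pd.merge(vanilla_df, gemm_df, on=["Model πŸ€—"], suffixes=["", " Custom Kernel"])
quant_df["Prefill Speedup (%)"] = (
    quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]
) * 100
print(quant_df["Prefill Speedup (%)"].iloc[0])  # 200.0
```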
src/llm_perf.py CHANGED
```diff
@@ -1,12 +1,9 @@
 import os
-from typing import List
 
 import pandas as pd
 
 from .utils import process_kernels, process_quantizations
 
-DATASET_DIRECTORY = "dataset"
-
 COLUMNS_MAPPING = {
     "config.name": "Experiment πŸ§ͺ",
     "config.backend.model": "Model πŸ€—",
@@ -29,34 +26,21 @@ COLUMNS_MAPPING = {
     "#Params (B)": "Params (B)",
 }
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
+SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
 SORTING_ASCENDING = [False, True, False]
 
 
-def get_raw_llm_perf_df(
-    machine: str, subsets: List[str], backends: List[str], hardware_type: str
-):
+def get_raw_llm_perf_df(machine: str = "1xA10"):
     dfs = []
-    for subset in subsets:
-        for backend in backends:
-            try:
-                dfs.append(
-                    pd.read_csv(
-                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
-                    )
+    for subset in SUBSETS:
+        try:
+            dfs.append(
+                pd.read_csv(
+                    f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{subset}-{machine}.csv"
                 )
-            except Exception:
-                print("Dataset not found for:")
-                print(f"  β€’ Backend: {backend}")
-                print(f"  β€’ Subset: {subset}")
-                print(f"  β€’ Machine: {machine}")
-                print(f"  β€’ Hardware Type: {hardware_type}")
-                url = f"https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard/blob/main/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
-                print(f"  β€’ URL: {url}")
-
-    if len(dfs) == 0:
-        raise ValueError(
-            f"No datasets found for machine {machine}, check your hardware.yml config file or your datatset on huggingface"
-        )
+            )
+        except Exception:
+            print(f"Subset {subset} for machine {machine} not found")
 
     perf_df = pd.concat(dfs)
     llm_df = pd.read_csv(
@@ -124,22 +108,12 @@ def processed_llm_perf_df(llm_perf_df):
     return llm_perf_df
 
 
-def get_llm_perf_df(
-    machine: str, subsets: List[str], backends: List[str], hardware_type: str
-):
-    if not os.path.exists(DATASET_DIRECTORY):
-        os.makedirs(DATASET_DIRECTORY)
-
-    if os.path.exists(f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"):
-        llm_perf_df = pd.read_csv(
-            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"
-        )
+def get_llm_perf_df(machine: str = "1xA10"):
+    if os.path.exists(f"llm-perf-leaderboard-{machine}.csv"):
+        llm_perf_df = pd.read_csv(f"llm-perf-leaderboard-{machine}.csv")
     else:
-        print(f"Dataset machine {machine} not found, downloading...")
-        llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
+        llm_perf_df = get_raw_llm_perf_df(machine)
         llm_perf_df = processed_llm_perf_df(llm_perf_df)
-        llm_perf_df.to_csv(
-            f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
-        )
+        llm_perf_df.to_csv(f"llm-perf-leaderboard-{machine}.csv", index=False)
 
     return llm_perf_df
```
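Both sides of `get_raw_llm_perf_df` read the benchmark CSVs straight from the Hub via pandas' `hf://` path support, which relies on `huggingface_hub` being installed (it is listed in requirements.txt). A hedged sketch of the download-then-cache pattern that `get_llm_perf_df` implements, with a hypothetical helper name:

```python
import os

import pandas as pd  # reading hf:// paths requires huggingface_hub to be installed


# Hypothetical helper mirroring get_llm_perf_df's caching: fetch one subset CSV for a
# machine from the Hub on the first call, then reuse the local copy afterwards.
def load_perf_subset(machine: str = "1xA10", subset: str = "unquantized") -> pd.DataFrame:
    local_path = f"perf-df-{subset}-{machine}.csv"
    if os.path.exists(local_path):
        return pd.read_csv(local_path)
    df = pd.read_csv(
        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{subset}-{machine}.csv"
    )
    df.to_csv(local_path, index=False)
    return df
```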
src/panel.py CHANGED
```diff
@@ -1,5 +1,3 @@
-from typing import List
-
 import gradio as gr
 
 from src.leaderboard import get_leaderboard_df
@@ -10,38 +8,9 @@ from src.llm_perf import get_llm_perf_df
 from src.map import get_lat_score_mem_fig
 
 
-def create_control_panel(
-    machine: str,
-    subsets: List[str],
-    backends: List[str],
-    hardware_provider: str,
-    hardware_type: str,
-):
+def create_control_panel(machine: str):
     # controls
-    machine_value = gr.State(value=machine)
-    subsets_value = gr.State(value=subsets)
-    backends_value = gr.State(value=backends)
-    hardware_type_value = gr.State(value=hardware_type)
-
-    if hardware_provider == "nvidia":
-        backends = ["pytorch"]
-        attention_implementations = ["Eager", "SDPA", "FAv2"]
-        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]
-        kernels = [
-            "No Kernel",
-            "GPTQ.ExllamaV1",
-            "GPTQ.ExllamaV2",
-            "AWQ.GEMM",
-            "AWQ.GEMV",
-        ]
-    elif hardware_provider == "intel":
-        backends = ["pytorch", "onnxruntime", "openvino"]
-        attention_implementations = ["Eager"]
-        quantizations = ["Unquantized"]
-        kernels = ["No Kernel"]
-    else:
-        raise ValueError(f"Unknown hardware provider: {hardware_provider}")
-
+    machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
         with gr.Row():
             with gr.Column(scale=2, variant="panel"):
@@ -63,8 +32,8 @@ def create_control_panel(
             with gr.Column(scale=1, variant="panel"):
                 backend_checkboxes = gr.CheckboxGroup(
                     label="Backends 🏭",
-                    choices=backends,
-                    value=backends,
+                    choices=["pytorch"],
+                    value=["pytorch"],
                     info="β˜‘οΈ Select the backends",
                     elem_id="backend-checkboxes",
                 )
@@ -80,8 +49,8 @@ def create_control_panel(
             with gr.Column(scale=1, variant="panel"):
                 optimization_checkboxes = gr.CheckboxGroup(
                     label="Attentions πŸ‘οΈ",
-                    choices=attention_implementations,
-                    value=attention_implementations,
+                    choices=["Eager", "SDPA", "FAv2"],
+                    value=["Eager", "SDPA", "FAv2"],
                     info="β˜‘οΈ Select the optimization",
                     elem_id="optimization-checkboxes",
                 )
@@ -89,8 +58,20 @@ def create_control_panel(
             with gr.Column(scale=1, variant="panel"):
                 quantization_checkboxes = gr.CheckboxGroup(
                     label="Quantizations πŸ—œοΈ",
-                    choices=quantizations,
-                    value=quantizations,
+                    choices=[
+                        "Unquantized",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "AWQ.4bit",
+                        "GPTQ.4bit",
+                    ],
+                    value=[
+                        "Unquantized",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "AWQ.4bit",
+                        "GPTQ.4bit",
+                    ],
                     info="β˜‘οΈ Select the quantization schemes",
                     elem_id="quantization-checkboxes",
                     elem_classes="boxed-option",
@@ -98,8 +79,20 @@ def create_control_panel(
             with gr.Column(scale=1, variant="panel"):
                 kernels_checkboxes = gr.CheckboxGroup(
                     label="Kernels βš›οΈ",
-                    choices=kernels,
-                    value=kernels,
+                    choices=[
+                        "No Kernel",
+                        "GPTQ.ExllamaV1",
+                        "GPTQ.ExllamaV2",
+                        "AWQ.GEMM",
+                        "AWQ.GEMV",
+                    ],
+                    value=[
+                        "No Kernel",
+                        "GPTQ.ExllamaV1",
+                        "GPTQ.ExllamaV2",
+                        "AWQ.GEMM",
+                        "AWQ.GEMV",
+                    ],
                     info="β˜‘οΈ Select the custom kernels",
                     elem_id="kernel-checkboxes",
                     elem_classes="boxed-option",
@@ -113,10 +106,7 @@
 
     return (
         filter_button,
-        machine_value,
-        backends_value,
-        hardware_type_value,
-        subsets_value,
+        machine_textbox,
        score_slider,
        memory_slider,
        backend_checkboxes,
@@ -129,13 +119,10 @@
 
 def filter_rows_fn(
     machine,
-    subsets,
-    backends,
-    hardware_type,
     # inputs
     score,
     memory,
-    backend_checkboxes,
+    backends,
     precisions,
     attentions,
     quantizations,
@@ -144,14 +131,12 @@
     columns,
     search,
 ):
-    llm_perf_df = get_llm_perf_df(
-        machine=machine, subsets=subsets, backends=backends, hardware_type=hardware_type
-    )
+    llm_perf_df = get_llm_perf_df(machine=machine)
     # print(attentions)
     # print(llm_perf_df["Attention πŸ‘οΈ"].unique())
     filtered_llm_perf_df = llm_perf_df[
         llm_perf_df["Model πŸ€—"].str.contains(search, case=False)
-        & llm_perf_df["Backend 🏭"].isin(backend_checkboxes)
+        & llm_perf_df["Backend 🏭"].isin(backends)
         & llm_perf_df["Precision πŸ“₯"].isin(precisions)
         & llm_perf_df["Attention πŸ‘οΈ"].isin(attentions)
         & llm_perf_df["Quantization πŸ—œοΈ"].isin(quantizations)
@@ -160,7 +145,7 @@
         & (llm_perf_df["Memory (MB)"] <= memory)
     ]
     selected_filtered_llm_perf_df = select_columns_fn(
-        machine, subsets, backends, hardware_type, columns, search, filtered_llm_perf_df
+        machine, columns, search, filtered_llm_perf_df
    )
    selected_filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_llm_perf_df)
    # filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
@@ -186,10 +171,7 @@ def create_control_callback(
     # button
     filter_button,
     # fixed
-    machine_value,
-    subsets_value,
-    backends_value,
-    hardware_type_value,
+    machine_textbox,
     # inputs
     score_slider,
     memory_slider,
@@ -215,10 +197,7 @@ def create_control_callback(
         fn=filter_rows_fn,
         inputs=[
             # fixed
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
+            machine_textbox,
             # inputs
             score_slider,
             memory_slider,
@@ -244,16 +223,9 @@ def create_control_callback(
     )
 
 
-def select_columns_fn(
-    machine, subsets, backends, hardware_type, columns, search, llm_perf_df=None
-):
+def select_columns_fn(machine, columns, search, llm_perf_df=None):
     if llm_perf_df is None:
-        llm_perf_df = get_llm_perf_df(
-            machine=machine,
-            subsets=subsets,
-            backends=backends,
-            hardware_type=hardware_type,
-        )
+        llm_perf_df = get_llm_perf_df(machine=machine)
 
     selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
     selected_leaderboard_df = selected_leaderboard_df[
@@ -266,10 +238,7 @@ def select_columns_fn(
 
 def create_select_callback(
     # fixed
-    machine_value,
-    subsets_value,
-    backends_value,
-    hardware_type_value,
+    machine_textbox,
     # interactive
     columns_checkboxes,
     search_bar,
@@ -278,25 +247,11 @@ def create_select_callback(
 ):
     columns_checkboxes.change(
         fn=select_columns_fn,
-        inputs=[
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
-            columns_checkboxes,
-            search_bar,
-        ],
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
         outputs=[leaderboard_table],
     )
     search_bar.change(
         fn=select_columns_fn,
-        inputs=[
-            machine_value,
-            subsets_value,
-            backends_value,
-            hardware_type_value,
-            columns_checkboxes,
-            search_bar,
-        ],
+        inputs=[machine_textbox, columns_checkboxes, search_bar],
         outputs=[leaderboard_table],
     )
```
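The recurring change in this file is how the fixed, per-tab `machine` string reaches the callbacks: the left side threads several `gr.State` values through, the right side a single hidden `gr.Textbox`. Both act as non-interactive input components whose value is handed to the callback; a minimal sketch with a hypothetical callback name:

```python
import gradio as gr


def describe_machine(machine: str) -> str:
    # Hypothetical callback standing in for filter_rows_fn / select_columns_fn.
    return f"Filtering leaderboard rows for {machine}"


with gr.Blocks() as demo:
    machine_state = gr.State(value="1xA10")                     # left-hand-side style
    machine_textbox = gr.Textbox(value="1xA10", visible=False)  # right-hand-side style
    status = gr.Textbox(label="Status")
    button = gr.Button("Filter")
    # Either component can be passed as a fixed input; the callback only sees its value.
    button.click(fn=describe_machine, inputs=[machine_textbox], outputs=[status])

if __name__ == "__main__":
    demo.launch()
```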
src/utils.py CHANGED
```diff
@@ -70,11 +70,6 @@ def process_quantizations(x):
         and x["config.backend.quantization_config.bits"] == 4
     ):
         return "AWQ.4bit"
-    elif (
-        x["config.backend.quantization_scheme"] == "torchao"
-        and x["config.backend.quantization_config.quant_type"] == "int4_weight_only"
-    ):
-        return "torchao.4bit"
     else:
         return "Unquantized"
 
```
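For clarity, the branch removed here is one arm of a scheme-to-label mapping applied per benchmark row; a standalone sketch of that mapping (not the real `src/utils.py` function, but the keys mirror the flattened benchmark config shown in the diff):

```python
# Standalone sketch of the quantization labelling that the removed torchao branch
# extended; the real process_quantizations operates on a pandas row with these keys.
def quantization_label(row: dict) -> str:
    if (
        row.get("config.backend.quantization_scheme") == "awq"
        and row.get("config.backend.quantization_config.bits") == 4
    ):
        return "AWQ.4bit"
    if (
        row.get("config.backend.quantization_scheme") == "torchao"
        and row.get("config.backend.quantization_config.quant_type") == "int4_weight_only"
    ):
        return "torchao.4bit"
    return "Unquantized"


print(quantization_label({
    "config.backend.quantization_scheme": "torchao",
    "config.backend.quantization_config.quant_type": "int4_weight_only",
}))  # -> torchao.4bit
```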