IlyasMoutawwakil committed
Commit a8a6326 (parent 3a67001)
Files changed (7):
  1. README.md +1 -1
  2. app.py +13 -7
  3. src/assets.py +31 -5
  4. src/content.py +28 -79
  5. src/control_panel.py +108 -81
  6. src/leaderboard.py +20 -13
  7. src/llm_perf.py +13 -8
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: πŸ†πŸ‹οΈ
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.9.0
+sdk_version: 4.26.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED
@@ -2,7 +2,7 @@ import os
 
 import gradio as gr
 
-from src.control_panel import create_control_panel, create_control_callback
+from src.control_panel import create_control_panel, create_control_callback, create_select_callback
 from src.latency_score_memory import create_lat_score_mem_plot
 from src.quantization_kernels import create_quant_plots
 from src.leaderboard import create_leaderboard_table
@@ -14,8 +14,6 @@ from src.content import (
     LOGO,
     TITLE,
     ABOUT,
-    INTRODUCTION,
-    EXAMPLE_CONFIG,
     CITATION_BUTTON,
     CITATION_BUTTON_LABEL,
 )
@@ -29,7 +27,6 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(LOGO, elem_classes="logo")
     gr.HTML(TITLE, elem_classes="title")
-    gr.Markdown(INTRODUCTION, elem_classes="descriptive-text")
     ####################### HARDWARE TABS #######################
     with gr.Tabs(elem_classes="tabs"):
         for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
@@ -51,7 +48,7 @@
             llm_perf_df = get_llm_perf_df(machine=machine)
             ####################### LEADERBOARD TAB #######################
             with gr.TabItem("Leaderboard πŸ…", id=0):
-                leaderboard_table = create_leaderboard_table(llm_perf_df)
+                leaderboard_table, columns_checkboxes = create_leaderboard_table(llm_perf_df)
                 lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
             ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
             with gr.TabItem("BetterTransformer πŸ“ˆ", id=2):
@@ -73,6 +70,7 @@
                 datatype_checkboxes,
                 optimization_checkboxes,
                 quantization_checkboxes,
+                columns_checkboxes,
                 # outputs
                 leaderboard_table,
                 lat_score_mem_plot,
@@ -83,10 +81,18 @@
                 quant_prefill_plot,
                 quant_decode_plot,
             )
+
+            create_select_callback(
+                # inputs
+                machine_textbox,
+                columns_checkboxes,
+                # outputs
+                leaderboard_table,
+            )
+
             ####################### ABOUT TAB #######################
             with gr.TabItem("About πŸ“–", id=3):
-                gr.HTML(ABOUT, elem_classes="descriptive-text")
-                gr.Markdown(EXAMPLE_CONFIG, elem_classes="descriptive-text")
+                gr.Markdown(ABOUT, elem_classes="descriptive-text")
     ####################### CITATION
     with gr.Row():
         with gr.Accordion("πŸ“™ Citation", open=False):
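Note: the new `create_select_callback` follows Gradio's standard event pattern, with a `CheckboxGroup.change` event recomputing the table whenever the column selection changes and the hidden `machine_textbox` carrying the current machine along as an input. A minimal self-contained sketch of the same pattern (toy dataframe and hypothetical column names, not the leaderboard's actual loaders):

```python
import gradio as gr
import pandas as pd

# Toy stand-in for the leaderboard dataframe (hypothetical values).
DF = pd.DataFrame({"Model": ["gpt2", "opt-125m"], "Prefill Latency (s)": [0.12, 0.10]})


def select_columns(columns):
    # Indexing with a list keeps only the checked columns, in checkbox order,
    # which is what select_fn does with the real leaderboard dataframe.
    return DF[columns]


with gr.Blocks() as demo:
    columns_checkboxes = gr.CheckboxGroup(
        choices=list(DF.columns), value=list(DF.columns), label="Columns πŸ“Š"
    )
    table = gr.Dataframe(value=DF)
    # .change fires on every (de)selection and pushes the new table to the output.
    columns_checkboxes.change(fn=select_columns, inputs=[columns_checkboxes], outputs=[table])

if __name__ == "__main__":
    demo.launch()
```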
src/assets.py CHANGED
@@ -17,16 +17,42 @@ custom_css = """
     font-size: 20px;
 }
 
-#citation-button span {
+.descriptive-text span {
     font-size: 16px !important;
 }
 
-#citation-button textarea {
+#control-panel span {
+    font-size: 20px !important;
+}
+#search-bar span {
+    font-size: 16px !important;
+}
+#threshold-slider span {
+    font-size: 16px !important;
+}
+#memory-slider span {
+    font-size: 16px !important;
+}
+#columns-checkboxes span {
+    font-size: 16px !important;
+}
+#backend-checkboxes span {
+    font-size: 16px !important;
+}
+#dtype-checkboxes span {
+    font-size: 16px !important;
+}
+#optimization-checkboxes span {
+    font-size: 16px !important;
+}
+#quantization-checkboxes span {
     font-size: 16px !important;
 }
 
-#citation-button > label > button {
-    margin: 6px;
-    transform: scale(1.3);
+#leaderboard-table td:first-child,
+#leaderboard-table th:first-child {
+    max-width: 300px;
+    overflow: auto;
+    white-space: nowrap;
 }
 """
src/content.py CHANGED
@@ -2,85 +2,34 @@ LOGO = '<img src="https://raw.githubusercontent.com/huggingface/optimum-benchmar
 
 TITLE = """<h1 align="center" id="space-title">πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h1>"""
 
-INTRODUCTION = """
-The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
-
-Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
-- Model evaluation requests should be made in the [πŸ€— Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
-- Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
-"""
-
-ABOUT = """<h3>About the πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h3>
-<ul>
-<li>To avoid communication-dependent results, only one GPU is used.</li>
-<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a>.</li>
-<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.</li>
-<li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
-<li>We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.</li>
-</ul>
-"""
-
-EXAMPLE_CONFIG = """
-Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
-```yaml
-defaults:
-  - backend: pytorch
-  - _base_ # inheriting from base config
-  - _self_ # for hydra 1.1 compatibility
-
-experiment_name: pytorch+cuda+float16+gptq-4bit+exllama-v1
-device: cuda
-
-backend:
-  no_weights: true
-  torch_dtype: float16
-  quantization_scheme: gptq
-  quantization_config:
-    bits: 4
-    use_cuda_fp16: false
-    use_exllama: true
-    exllama_config:
-      version: 1
-```
-
-Where the base config is:
-```yaml
-defaults:
-  - benchmark: inference # default benchmark
-  - launcher: process # isolated process launcher
-  - experiment # inheriting from experiment config
-  - _self_ # for hydra 1.1 compatibility
-  - override hydra/job_logging: colorlog # colorful logging
-  - override hydra/hydra_logging: colorlog # colorful logging
-
-hydra:
-  run:
-    dir: dataset/${oc.env:HOSTNAME}/${experiment_name}/${model}
-  job:
-    chdir: true
-    env_set:
-      COUNTRY_ISO_CODE: FRA
-      OVERRIDE_BENCHMARKS: 0
-      CUDA_VISIBLE_DEVICES: 0
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-
-backend:
-  continuous_isolation: true
-
-benchmark:
-  duration: 10
-  memory: true
-  energy: true
-
-input_shapes:
-  batch_size: 1
-  sequence_length: 256
-
-new_tokens: 256
-
-hub_kwargs:
-  trust_remote_code: true
-```
+ABOUT = """
+## πŸ“ About
+The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ is a leaderboard at the intersection of quality and performance.
+Its aim is to benchmark the performance (latency, throughput, memory & energy)
+of Large Language Models (LLMs) with different hardware, backends and optimizations
+using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).
+
+Anyone from the community can request a new base model or hardware/backend/optimization
+configuration for automated benchmarking:
+
+- Model evaluation requests should be made in the
+[πŸ€— Open LLM Leaderboard πŸ…](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard);
+we scrape the list of pretrained base models from there.
+- Hardware/Backend/Optimization configuration requests should be made in the
+[πŸ€— LLM-Perf Leaderboard πŸ‹οΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
+[Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
+
+## ✍️ Details
+
+- To avoid communication-dependent results, only one GPU is used.
+- Score is the average evaluation score obtained from the [πŸ€— Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+- LLMs run on a singleton batch with a prompt size of 256, generating 256 tokens.
+- Energy consumption is measured in kWh using CodeCarbon, taking into consideration the GPU, CPU, RAM and location of the machine.
+- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two are reported by PyTorch, while the last one is observed using PyNVML.
+
+All of our benchmarks are run by this single script,
+[benchmark_cuda_pytorch.py](https://github.com/huggingface/optimum-benchmark/blob/llm-perf/llm-perf/benchmark_cuda_pytorch.py),
+using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) to guarantee reproducibility and consistency.
 """
 
 
src/control_panel.py CHANGED
@@ -9,90 +9,91 @@ from src.quantization_kernels import get_quant_prefill_fig, get_quant_decode_fig
 
 
 def create_control_panel(machine: str = "hf-dgx-01"):
-    # descriptive text
-    gr.HTML("Use this control panel to filter the leaderboard.", elem_id="text")
     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
-    with gr.Row():
-        with gr.Column():
-            search_bar = gr.Textbox(
-                label="Model πŸ€—",
-                info="πŸ” Search for a model name",
-                elem_id="search-bar",
-            )
-    with gr.Row():
-        with gr.Column(scale=1, variant="panel"):
-            score_slider = gr.Slider(
-                label="Open LLM Score (%) πŸ“ˆ",
-                info="🎚️ Slide to minimum Open LLM score",
-                value=0,
-                elem_id="threshold-slider",
-            )
-        with gr.Column(scale=1, variant="panel"):
-            memory_slider = gr.Slider(
-                label="Peak Memory (MB) πŸ“ˆ",
-                info="🎚️ Slide to maximum Peak Memory",
-                minimum=0,
-                maximum=80 * 1024,
-                value=80 * 1024,
-                elem_id="memory-slider",
-            )
-        with gr.Column(scale=1):
-            backend_checkboxes = gr.CheckboxGroup(
-                label="Backends 🏭",
-                choices=["pytorch"],
-                value=["pytorch"],
-                info="β˜‘οΈ Select the backends",
-                elem_id="backend-checkboxes",
-            )
-    with gr.Row():
-        with gr.Column(scale=1, variant="panel"):
-            datatype_checkboxes = gr.CheckboxGroup(
-                label="Load DTypes πŸ“₯",
-                choices=["float32", "float16", "bfloat16"],
-                value=["float32", "float16", "bfloat16"],
-                info="β˜‘οΈ Select the load data types",
-                elem_id="dtype-checkboxes",
-            )
-        with gr.Column(scale=1, variant="panel"):
-            optimization_checkboxes = gr.CheckboxGroup(
-                label="Optimizations πŸ› οΈ",
-                choices=["None", "BetterTransformer", "FlashAttentionV2"],
-                value=["None", "BetterTransformer", "FlashAttentionV2"],
-                info="β˜‘οΈ Select the optimization",
-                elem_id="optimization-checkboxes",
-            )
-        with gr.Column(scale=2):
-            quantization_checkboxes = gr.CheckboxGroup(
-                label="Quantizations πŸ—œοΈ",
-                choices=[
-                    "None",
-                    "BnB.4bit",
-                    "BnB.8bit",
-                    "GPTQ.4bit",
-                    "GPTQ.4bit+ExllamaV1",
-                    "GPTQ.4bit+ExllamaV2",
-                    "AWQ.4bit+GEMM",
-                    "AWQ.4bit+GEMV",
-                ],
-                value=[
-                    "None",
-                    "BnB.4bit",
-                    "BnB.8bit",
-                    "GPTQ.4bit",
-                    "GPTQ.4bit+ExllamaV1",
-                    "GPTQ.4bit+ExllamaV2",
-                    "AWQ.4bit+GEMM",
-                    "AWQ.4bit+GEMV",
-                ],
-                info="β˜‘οΈ Select the quantization schemes",
-                elem_id="quantization-checkboxes",
-            )
-    with gr.Row():
-        filter_button = gr.Button(
-            value="Filter πŸš€",
-            elem_id="filter-button",
-        )
+    with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
+        with gr.Row():
+            with gr.Column():
+                search_bar = gr.Textbox(
+                    label="Model πŸ€—",
+                    info="πŸ” Search for a model name",
+                    elem_id="search-bar",
+                )
+        with gr.Row():
+            with gr.Column(scale=1, variant="panel"):
+                score_slider = gr.Slider(
+                    label="Open LLM Score (%) πŸ“ˆ",
+                    info="🎚️ Slide to minimum Open LLM score",
+                    value=0,
+                    elem_id="threshold-slider",
+                )
+            with gr.Column(scale=1, variant="panel"):
+                memory_slider = gr.Slider(
+                    label="Peak Memory (MB) πŸ“ˆ",
+                    info="🎚️ Slide to maximum Peak Memory",
+                    minimum=0,
+                    maximum=80 * 1024,
+                    value=80 * 1024,
+                    elem_id="memory-slider",
+                )
+            with gr.Column(scale=1):
+                backend_checkboxes = gr.CheckboxGroup(
+                    label="Backends 🏭",
+                    choices=["pytorch"],
+                    value=["pytorch"],
+                    info="β˜‘οΈ Select the backends",
+                    elem_id="backend-checkboxes",
+                )
+        with gr.Row():
+            with gr.Column(scale=1, variant="panel"):
+                datatype_checkboxes = gr.CheckboxGroup(
+                    label="Load DTypes πŸ“₯",
+                    choices=["float32", "float16", "bfloat16"],
+                    value=["float32", "float16", "bfloat16"],
+                    info="β˜‘οΈ Select the load data types",
+                    elem_id="dtype-checkboxes",
+                )
+            with gr.Column(scale=1, variant="panel"):
+                optimization_checkboxes = gr.CheckboxGroup(
+                    label="Optimizations πŸ› οΈ",
+                    choices=["None", "BetterTransformer", "FlashAttentionV2"],
+                    value=["None", "BetterTransformer", "FlashAttentionV2"],
+                    info="β˜‘οΈ Select the optimization",
+                    elem_id="optimization-checkboxes",
+                )
+            with gr.Column(scale=2):
+                quantization_checkboxes = gr.CheckboxGroup(
+                    label="Quantizations πŸ—œοΈ",
+                    choices=[
+                        "None",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "GPTQ.4bit",
+                        "GPTQ.4bit+ExllamaV1",
+                        "GPTQ.4bit+ExllamaV2",
+                        "AWQ.4bit+GEMM",
+                        "AWQ.4bit+GEMV",
+                    ],
+                    value=[
+                        "None",
+                        "BnB.4bit",
+                        "BnB.8bit",
+                        "GPTQ.4bit",
+                        "GPTQ.4bit+ExllamaV1",
+                        "GPTQ.4bit+ExllamaV2",
+                        "AWQ.4bit+GEMM",
+                        "AWQ.4bit+GEMV",
+                    ],
+                    info="β˜‘οΈ Select the quantization schemes",
+                    elem_id="quantization-checkboxes",
+                    elem_classes="boxed-option",
+                )
+        with gr.Row():
+            filter_button = gr.Button(
+                value="Filter πŸš€",
+                elem_id="filter-button",
+                elem_classes="boxed-option",
+            )
 
     return (
         filter_button,
@@ -114,6 +115,7 @@ def filter_fn(
     datatypes,
     optimizations,
     quantizations,
+    columns,
     score,
     memory,
 ):
@@ -128,6 +130,7 @@ def filter_fn(
         & (raw_df["Allocated Memory (MB)"] <= memory)
     ]
     filtered_leaderboard_df = get_leaderboard_df(filtered_df)
+    filtered_leaderboard_df = filtered_leaderboard_df[columns]
    filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
     filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
     filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
@@ -160,6 +163,7 @@ def create_control_callback(
     datatype_checkboxes,
     optimization_checkboxes,
     quantization_checkboxes,
+    columns_checkboxes,
     # outputs
     leaderboard_table,
     lat_score_mem_plot,
@@ -179,6 +183,7 @@
             datatype_checkboxes,
             optimization_checkboxes,
             quantization_checkboxes,
+            columns_checkboxes,
             score_slider,
             memory_slider,
         ],
@@ -193,3 +198,25 @@
             quant_decode_plot,
         ],
     )
+
+
+def select_fn(machine, columns):
+    raw_df = get_llm_perf_df(machine=machine)
+    selected_leaderboard_df = get_leaderboard_df(raw_df)
+    selected_leaderboard_df = selected_leaderboard_df[columns]
+
+    return selected_leaderboard_df
+
+
+def create_select_callback(
+    # inputs
+    machine_textbox,
+    columns_checkboxes,
+    # outputs
+    leaderboard_table,
+):
+    columns_checkboxes.change(
+        fn=select_fn,
+        inputs=[machine_textbox, columns_checkboxes],
+        outputs=[leaderboard_table],
+    )
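Both `filter_fn` and the new `select_fn` lean on the same pandas idioms: boolean masks for row filtering and list indexing (`df[columns]`) for column selection. A small illustration with toy data (hypothetical values, real column names from the leaderboard):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model πŸ€—": ["llama-7b", "mistral-7b"],
        "Open LLM Score (%)": [55.0, 62.0],
        "Allocated Memory (MB)": [14_000, 15_000],
    }
)

# Row filtering, as in filter_fn: score above the slider, memory under the slider.
mask = (df["Open LLM Score (%)"] >= 60) & (df["Allocated Memory (MB)"] <= 80 * 1024)
filtered = df[mask]

# Column selection, as in select_fn: keeps only the checked columns, in order.
columns = ["Model πŸ€—", "Open LLM Score (%)"]
print(filtered[columns])
```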
src/leaderboard.py CHANGED
@@ -5,21 +5,22 @@ from src.utils import model_hyperlink, process_score
 
 LEADERBOARD_COLUMN_TO_DATATYPE = {
     # open llm
-    "Model πŸ€—" :"markdown",
-    "Arch πŸ›οΈ" :"markdown",
-    "Params (B)": "number",
-    "Open LLM Score (%)": "number",
-    # deployment settings
-    "DType πŸ“₯" :"str",
-    "Backend 🏭" :"str",
-    "Optimization πŸ› οΈ" :"str",
-    "Quantization πŸ—œοΈ" :"str",
+    "Model πŸ€—": "markdown",
+    "Experiment πŸ§ͺ": "str",
     # primary measurements
     "Prefill Latency (s)": "number",
     "Decode Throughput (tokens/s)": "number",
     "Allocated Memory (MB)": "number",
     "Energy (tokens/kWh)": "number",
+    # deployment settings
+    "DType πŸ“₯": "str",
+    "Backend 🏭": "str",
+    "Optimization πŸ› οΈ": "str",
+    "Quantization πŸ—œοΈ": "str",
     # additional measurements
+    "Arch πŸ›οΈ": "markdown",
+    "Params (B)": "number",
+    "Open LLM Score (%)": "number",
     "E2E Latency (s)": "number",
     "E2E Throughput (tokens/s)": "number",
     "Reserved Memory (MB)": "number",
@@ -45,16 +46,22 @@ def get_leaderboard_df(llm_perf_df):
 
 
 def create_leaderboard_table(llm_perf_df):
-    # descriptive text
-    gr.HTML("πŸ‘‰ Scroll to the right πŸ‘‰ for additional columns.", elem_id="text")
     # get dataframe
     leaderboard_df = get_leaderboard_df(llm_perf_df)
+    # create checkboxes
+    columns_checkboxes = gr.CheckboxGroup(
+        label="Columns πŸ“Š",
+        choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+        value=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+        info="β˜‘οΈ Select the columns to display",
+        elem_id="columns-checkboxes",
+    )
     # create table
     leaderboard_table = gr.components.Dataframe(
         value=leaderboard_df,
         datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
         headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
-        elem_id="table",
+        elem_id="leaderboard-table",
     )
 
-    return leaderboard_table
+    return leaderboard_table, columns_checkboxes
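The `LEADERBOARD_COLUMN_TO_DATATYPE` dict keeps headers and per-column rendering in one place: its keys become the table headers and its values pick each column's renderer ("markdown" cells render hyperlinks such as the model links, "number" cells render numeric values). A minimal sketch with a toy dataframe (hypothetical values):

```python
import gradio as gr
import pandas as pd

COLUMN_TO_DATATYPE = {"Model πŸ€—": "markdown", "Prefill Latency (s)": "number"}
df = pd.DataFrame(
    {"Model πŸ€—": ["[gpt2](https://huggingface.co/gpt2)"], "Prefill Latency (s)": [0.12]}
)

with gr.Blocks() as demo:
    gr.Dataframe(
        value=df,
        headers=list(COLUMN_TO_DATATYPE.keys()),
        datatype=list(COLUMN_TO_DATATYPE.values()),  # "markdown" renders the link
    )

if __name__ == "__main__":
    demo.launch()
```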
src/llm_perf.py CHANGED
@@ -10,8 +10,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 COLUMNS_MAPPING = {
     "Model": "Model πŸ€—",
-    "Arch": "Arch πŸ›οΈ",
-    "Size": "Params (B)",
+    "experiment_name": "Experiment πŸ§ͺ",
     # primary measurements
     "forward.latency(s)": "Prefill Latency (s)",
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
@@ -23,11 +22,13 @@ COLUMNS_MAPPING = {
     "optimization": "Optimization πŸ› οΈ",
     "quantization": "Quantization πŸ—œοΈ",
     # additional measurements
+    "Arch": "Arch πŸ›οΈ",
+    "Size": "Params (B)",
     "Score": "Open LLM Score (%)",
     "generate.latency(s)": "E2E Latency (s)",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
-    # "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
-    # "generate.max_memory_used(MB)": "Used Memory (MB)",
+    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
+    "generate.max_memory_used(MB)": "Used Memory (MB)",
 }
 SORTING_COLUMNS = [
     "Open LLM Score (%)",
@@ -46,7 +47,7 @@ def get_llm_df():
         repo_type="dataset",
         token=HF_TOKEN,
     )
-
+
     llm_df = pd.read_csv("dataset/open-llm.csv")
 
     return llm_df
@@ -86,9 +87,11 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
 
     # add optimization column
     llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
-        lambda x: "BetterTransformer"
-        if x["backend.to_bettertransformer"]
-        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
+        lambda x: (
+            "BetterTransformer"
+            if x["backend.to_bettertransformer"]
+            else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
+        ),
         axis=1,
     )
     # add quantization scheme
@@ -102,6 +105,8 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
             "backend.quantization_config.exllama_config.version",
         ]
     ].apply(lambda x: process_quantization_scheme(x), axis=1)
+    # process experiment name
+    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
     # add arch
     llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # filter columns
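The `experiment_name` cleanup is a plain string strip: every experiment on these machines presumably starts with the `pytorch+cuda+` prefix (the removed example config used `pytorch+cuda+float16+gptq-4bit+exllama-v1`), so removing it leaves just the dtype/optimization/quantization part shown in the new Experiment πŸ§ͺ column. A quick illustration on a toy series:

```python
import pandas as pd

names = pd.Series(
    [
        "pytorch+cuda+float16",
        "pytorch+cuda+float16+gptq-4bit+exllama-v1",
    ]
)
# regex=False treats "+" literally; equivalent to the .apply(lambda ...) in the diff.
print(names.str.replace("pytorch+cuda+", "", regex=False))
# -> "float16" and "float16+gptq-4bit+exllama-v1"
```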