IlyasMoutawwakil HF staff committed on
Commit
2460b35
•
1 Parent(s): bb5689a

added exllama v2

Browse files
Files changed (4) hide show
  1. app.py +7 -1
  2. src/bettertransformer.py +0 -1
  3. src/control_panel.py +4 -0
  4. src/exllama.py +148 -0
app.py CHANGED
@@ -5,8 +5,9 @@ import gradio as gr
5
  from src.control_panel import create_control_panel, create_control_callback
6
  from src.latency_score_memory import create_lat_score_mem_plot
7
  from src.leaderboard import create_leaderboard_table
8
- from src.flashattentionv2 import create_fa2_plots
9
  from src.bettertransformer import create_bt_plots
 
 
10
  from src.llm_perf import get_llm_perf_df
11
  from src.assets import custom_css
12
  from src.content import (
@@ -59,6 +60,9 @@ with demo:
59
  bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
60
  with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
61
  fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
 
 
 
62
  ####################### CONTROL CALLBACK #######################
63
  create_control_callback(
64
  filter_button,
@@ -78,6 +82,8 @@ with demo:
78
  bt_decode_plot,
79
  fa2_prefill_plot,
80
  fa2_decode_plot,
 
 
81
  )
82
  ####################### ABOUT TAB #######################
83
  with gr.TabItem("About 📖", id=3):
 
5
  from src.control_panel import create_control_panel, create_control_callback
6
  from src.latency_score_memory import create_lat_score_mem_plot
7
  from src.leaderboard import create_leaderboard_table
 
8
  from src.bettertransformer import create_bt_plots
9
+ from src.flashattentionv2 import create_fa2_plots
10
+ from src.exllama import create_exllama_plots
11
  from src.llm_perf import get_llm_perf_df
12
  from src.assets import custom_css
13
  from src.content import (
 
60
  bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
61
  with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
62
  fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
63
+ with gr.TabItem("Exllama Speedup 📈", id=4):
64
+ exllama_prefill_plot, exllama_decode_plot = create_exllama_plots(llm_perf_df)
65
+
66
  ####################### CONTROL CALLBACK #######################
67
  create_control_callback(
68
  filter_button,
 
82
  bt_decode_plot,
83
  fa2_prefill_plot,
84
  fa2_decode_plot,
85
+ exllama_prefill_plot,
86
+ exllama_decode_plot,
87
  )
88
  ####################### ABOUT TAB #######################
89
  with gr.TabItem("About 📖", id=3):
src/bettertransformer.py CHANGED
@@ -47,7 +47,6 @@ def get_bt_df(llm_perf_df):
47
  bt_df["Decode Throughput Speedup (%)"] = (
48
  (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
49
  ).round(2) - 100
50
-
51
  # filter speedups > 1000%
52
  bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
53
  bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
 
47
  bt_df["Decode Throughput Speedup (%)"] = (
48
  (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
49
  ).round(2) - 100
 
50
  # filter speedups > 1000%
51
  bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
52
  bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
src/control_panel.py CHANGED
@@ -144,6 +144,8 @@ def create_control_callback(
144
  bt_decode_plot,
145
  fa2_prefill_plot,
146
  fa2_decode_plot,
 
 
147
  ):
148
  filter_button.click(
149
  fn=filter_fn,
@@ -164,5 +166,7 @@ def create_control_callback(
164
  bt_decode_plot,
165
  fa2_prefill_plot,
166
  fa2_decode_plot,
 
 
167
  ],
168
  )
 
144
  bt_decode_plot,
145
  fa2_prefill_plot,
146
  fa2_decode_plot,
147
+ exllama_prefill_plot,
148
+ exllama_decode_plot,
149
  ):
150
  filter_button.click(
151
  fn=filter_fn,
 
166
  bt_decode_plot,
167
  fa2_prefill_plot,
168
  fa2_decode_plot,
169
+ exllama_prefill_plot,
170
+ exllama_decode_plot,
171
  ],
172
  )
src/exllama.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
+
6
+ EXLLAMA_DATA = [
7
+ # open llm
8
+ "Model 🤗",
9
+ "Arch 🏛️",
10
+ "DType 📥",
11
+ "Backend 🏭",
12
+ "Params (B)",
13
+ "Open LLM Score (%)",
14
+ # deployment settings
15
+ "DType 📥",
16
+ "Backend 🏭",
17
+ "Quantization 🗜️",
18
+ # primary measurements
19
+ "Prefill Latency (s)",
20
+ "Prefill Latency (s) Exllama",
21
+ "Decode Throughput (tokens/s)",
22
+ "Decode Throughput (tokens/s) Exllama",
23
+ "E2E Throughput (tokens/s)",
24
+ "E2E Throughput (tokens/s) Exllama",
25
+ # speedups
26
+ "Prefill Latency Speedup (%)",
27
+ "Decode Throughput Speedup (%)",
28
+ ]
29
+
30
+
31
+ def get_exllama_df(llm_perf_df):
32
+ exllama_df = llm_perf_df.copy()
33
+ # separate original model experiments from Exllama experiments
34
+ gptq_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit")]
35
+ exllamav1_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
36
+ exllamav2_df = exllama_df[(exllama_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
37
+ # merge the three dataframes
38
+ exllamav1_df = pd.merge(
39
+ gptq_df,
40
+ exllamav1_df,
41
+ on=["Model 🤗"],
42
+ suffixes=["", " Exllama"],
43
+ )
44
+ exllamav2_df = pd.merge(
45
+ gptq_df,
46
+ exllamav2_df,
47
+ on=["Model 🤗"],
48
+ suffixes=["", " Exllama"],
49
+ )
50
+ # concat the two dataframes row-wise
51
+ exllama_df = pd.concat([exllamav1_df, exllamav2_df])
52
+ exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]
53
+ # compute speedups
54
+ exllama_df["Prefill Latency Speedup (%)"] = (
55
+ (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]) * 100
56
+ ).round(2) - 100
57
+ exllama_df["Decode Throughput Speedup (%)"] = (
58
+ (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]) * 100
59
+ ).round(2) - 100
60
+ # filter speedups > 1000%
61
+ exllama_df = exllama_df[exllama_df["Prefill Latency Speedup (%)"] < 1000]
62
+ exllama_df = exllama_df[exllama_df["Decode Throughput Speedup (%)"] < 1000]
63
+
64
+ return exllama_df
65
+
66
+
67
+ def get_exllama_decode_fig(llm_perf_df):
68
+ exllama_df = get_exllama_df(llm_perf_df)
69
+ # plot
70
+ decode_fig = px.box(
71
+ exllama_df,
72
+ x="Arch 🏛️",
73
+ y="Decode Throughput Speedup (%)",
74
+ color_discrete_sequence=px.colors.qualitative.Light24,
75
+ custom_data=EXLLAMA_DATA,
76
+ color="Quantization 🗜️ Exllama",
77
+ points="all",
78
+ )
79
+ # add hover data
80
+ decode_fig.update_traces(
81
+ hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
82
+ )
83
+ # add layout
84
+ decode_fig.update_layout(
85
+ title={
86
+ "text": "Decode Throughput Speedup per Architecture",
87
+ "y": 0.95,
88
+ "x": 0.5,
89
+ "xanchor": "center",
90
+ "yanchor": "top",
91
+ },
92
+ xaxis_title="LLM Architecture",
93
+ yaxis_title="Decode Speedup (%)",
94
+ legend_title="Quantization Scheme",
95
+ width=1200,
96
+ height=600,
97
+ )
98
+
99
+ return decode_fig
100
+
101
+
102
+ def get_exllama_prefill_fig(llm_perf_df):
103
+ exllama_df = get_exllama_df(llm_perf_df)
104
+ # plot
105
+ prefill_fig = px.box(
106
+ exllama_df,
107
+ x="Arch 🏛️",
108
+ y="Prefill Latency Speedup (%)",
109
+ color_discrete_sequence=px.colors.qualitative.Light24,
110
+ custom_data=EXLLAMA_DATA,
111
+ color="Quantization 🗜️ Exllama",
112
+ points="all",
113
+ )
114
+ # add hover data
115
+ prefill_fig.update_traces(
116
+ hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)])
117
+ )
118
+ # add layout
119
+ prefill_fig.update_layout(
120
+ title={
121
+ "text": "Prefill Latency Speedup per Architecture",
122
+ "y": 0.95,
123
+ "x": 0.5,
124
+ "xanchor": "center",
125
+ "yanchor": "top",
126
+ },
127
+ xaxis_title="LLM Architecture",
128
+ yaxis_title="Prefill Speedup (%)",
129
+ legend_title="Quantization Scheme",
130
+ width=1200,
131
+ height=600,
132
+ )
133
+
134
+ return prefill_fig
135
+
136
+
137
+ def create_exllama_plots(llm_perf_df):
138
+ # descriptive text
139
+ gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
140
+ # get figures
141
+ prefill_fig = get_exllama_prefill_fig(llm_perf_df)
142
+ decode_fig = get_exllama_decode_fig(llm_perf_df)
143
+
144
+ # create plots
145
+ prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
146
+ decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
147
+
148
+ return prefill_plot, decode_plot