File size: 5,177 Bytes
2460b35
 
 
 
 
dc685a9
2460b35
 
 
 
 
 
 
 
 
 
14d526b
2460b35
14d526b
 
2460b35
 
14d526b
2460b35
14d526b
2460b35
 
 
 
 
 
dc685a9
08604d0
14d526b
 
dc685a9
14d526b
dc685a9
14d526b
 
08604d0
 
14d526b
 
2460b35
 
14d526b
2460b35
 
14d526b
2460b35
 
14d526b
2460b35
 
14d526b
 
 
 
 
 
 
 
 
 
 
 
 
2460b35
 
dc685a9
2460b35
dc685a9
 
2460b35
dc685a9
 
2460b35
 
dc685a9
 
2460b35
dc685a9
2460b35
 
dc685a9
 
2460b35
 
dc685a9
2460b35
 
 
dc685a9
14d526b
2460b35
 
 
 
dc685a9
2460b35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc685a9
 
2460b35
 
dc685a9
2460b35
 
 
dc685a9
14d526b
2460b35
 
 
 
dc685a9
2460b35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc685a9
2460b35
 
 
dc685a9
 
2460b35
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import gradio as gr
import pandas as pd
import plotly.express as px


# Columns attached to every plotted point as plotly ``customdata`` and rendered,
# in this order, as one line each of the hover tooltip.
# Each column is listed exactly once: a repeated name would repeat the same
# line in the tooltip and duplicate the column in ``customdata``.
QUANT_DATA = [
    # open llm
    "Model πŸ€—",
    "Arch πŸ›οΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType πŸ“₯",
    "Backend 🏭",
    "Optimization πŸ› οΈ",
    "Quantization πŸ—œοΈ",
    "Optimization πŸ› οΈ Custom Kernel",
    "Quantization πŸ—œοΈ Custom Kernel",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) Custom Kernel",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Custom Kernel",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]


def get_quant_df(llm_perf_df):
    """Pair quantized custom-kernel runs with baseline runs and compute speedups.

    Baseline ("vanilla") rows are those with the pytorch backend, the float16
    dtype, and both quantization and optimization equal to the string "None".
    For each quantization scheme handled here, the matching rows are
    inner-joined to the baseline rows on the model column; overlapping columns
    coming from the quantized run receive the " Custom Kernel" suffix.

    Args:
        llm_perf_df: DataFrame containing the model, backend, dtype,
            optimization, quantization, prefill-latency and decode-throughput
            columns referenced below. It is only read, never modified.

    Returns:
        A new DataFrame with one row per (baseline, quantized) pair and two
        extra columns, "Prefill Latency Speedup (%)" and
        "Decode Throughput Speedup (%)". Only rows whose speedups are both
        below 1000 are kept, and the index is a fresh 0..n-1 range.
    """
    model_col = "Model πŸ€—"
    quant_col = "Quantization πŸ—œοΈ"
    prefill_col = "Prefill Latency (s)"
    decode_col = "Decode Throughput (tokens/s)"
    kernel_suffix = " Custom Kernel"
    quant_schemes = (
        "GPTQ.4bit+ExllamaV1",
        "GPTQ.4bit+ExllamaV2",
        "AWQ.4bit+GEMM",
        "AWQ.4bit+GEMV",
    )

    # baseline experiments: no quantization, no optimization, float16 on pytorch
    is_vanilla = (
        (llm_perf_df["Backend 🏭"] == "pytorch")
        & (llm_perf_df[quant_col] == "None")
        & (llm_perf_df["Optimization πŸ› οΈ"] == "None")
        & (llm_perf_df["DType πŸ“₯"] == "float16")
    )
    vanilla_df = llm_perf_df[is_vanilla]

    # one baseline-vs-scheme merge per quantization scheme
    paired_frames = [
        pd.merge(
            vanilla_df,
            llm_perf_df[llm_perf_df[quant_col] == scheme],
            on=[model_col],
            suffixes=("", kernel_suffix),
        )
        for scheme in quant_schemes
    ]
    # every merge result is indexed from 0, so renumber instead of keeping
    # repeated index labels in the stacked frame
    quant_df = pd.concat(paired_frames, ignore_index=True)

    # latency: lower is better, so baseline / custom kernel
    quant_df["Prefill Latency Speedup (%)"] = (
        (quant_df[prefill_col] / quant_df[prefill_col + kernel_suffix]) * 100
    ).round(2) - 100
    # throughput: higher is better, so custom kernel / baseline
    quant_df["Decode Throughput Speedup (%)"] = (
        (quant_df[decode_col + kernel_suffix] / quant_df[decode_col]) * 100
    ).round(2) - 100

    # keep only rows whose speedups are both below 1000%
    # (NaN and infinite values fail the comparison and are dropped as well)
    within_bounds = (quant_df["Prefill Latency Speedup (%)"] < 1000) & (
        quant_df["Decode Throughput Speedup (%)"] < 1000
    )
    return quant_df[within_bounds].reset_index(drop=True)


def get_quant_decode_fig(llm_perf_df):
    """Return a box plot of decode throughput speedup per architecture.

    The plotted data is ``get_quant_df(llm_perf_df)``. Boxes are colored by the
    custom-kernel quantization column and all points are drawn. Every column in
    ``QUANT_DATA`` is attached as custom data and listed in the hover tooltip.
    """
    speedup_df = get_quant_df(llm_perf_df)

    # box plot with every underlying point shown
    fig = px.box(
        speedup_df,
        x="Arch πŸ›οΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization πŸ—œοΈ Custom Kernel",
        points="all",
        custom_data=QUANT_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    # hover tooltip: one "<b>name:</b> value" line per custom-data column
    hover_template = "<br>".join(
        f"<b>{column}:</b> %{{customdata[{position}]}}" for position, column in enumerate(QUANT_DATA)
    )
    fig.update_traces(hovertemplate=hover_template)

    # centered title, axis and legend labels, fixed canvas size
    centered_title = dict(
        text="Decode Throughput Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    fig.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return fig


def get_quant_prefill_fig(llm_perf_df):
    """Return a box plot of prefill latency speedup per architecture.

    The plotted data is ``get_quant_df(llm_perf_df)``. Boxes are colored by the
    custom-kernel quantization column and all points are drawn. Every column in
    ``QUANT_DATA`` is attached as custom data and listed in the hover tooltip.
    """
    speedup_df = get_quant_df(llm_perf_df)

    # box plot with every underlying point shown
    fig = px.box(
        speedup_df,
        x="Arch πŸ›οΈ",
        y="Prefill Latency Speedup (%)",
        color="Quantization πŸ—œοΈ Custom Kernel",
        points="all",
        custom_data=QUANT_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    # hover tooltip: one "<b>name:</b> value" line per custom-data column
    hover_template = "<br>".join(
        f"<b>{column}:</b> %{{customdata[{position}]}}" for position, column in enumerate(QUANT_DATA)
    )
    fig.update_traces(hovertemplate=hover_template)

    # centered title, axis and legend labels, fixed canvas size
    centered_title = dict(
        text="Prefill Latency Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    fig.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return fig


def create_quant_plots(llm_perf_df):
    """Create the hover hint and the two quantization speedup plot components.

    An HTML hint is created first, then both figures are built from
    ``llm_perf_df``, and finally one ``Plot`` component is created per figure.

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of gradio Plot components.
    """
    # hint shown to the user
    gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")

    # build both figures before any plot component is created
    figure_builders = (get_quant_prefill_fig, get_quant_decode_fig)
    figures = [build(llm_perf_df) for build in figure_builders]

    # wrap each figure in a plot component, prefill first and decode second
    return tuple(
        gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures
    )