Spaces:
Running
Running
File size: 4,270 Bytes
ab5f5f1 a1135a9 ab5f5f1 a1135a9 ab5f5f1 a1135a9 ab5f5f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import gradio as gr
import pandas as pd
import plotly.express as px
# Columns attached to every plotted point as Plotly custom data and rendered,
# in this order, as one line each in the hover tooltip. Each column is listed
# exactly once so the tooltip does not repeat a line.
BETTERTRANSFORMER_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π",
    "Quantization ποΈ",
    # primary measurements (baseline next to its BetterTransformer counterpart)
    "Prefill Latency (s)",
    "Prefill Latency (s) BetterTransformer",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) BetterTransformer",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) BetterTransformer",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
def get_bt_df(llm_perf_df):
    """Pair baseline runs with BetterTransformer runs and compute speedups.

    Baseline rows (optimization ``"None"``) and BetterTransformer rows are both
    restricted to ``float16`` so each speedup compares runs of the same dtype.
    The two sets are then inner-joined on model and quantization scheme; any
    BetterTransformer-side column that clashes with a baseline column gets a
    ``" BetterTransformer"`` suffix.

    Args:
        llm_perf_df: DataFrame holding at least the optimization, dtype, model,
            quantization, ``"Prefill Latency (s)"`` and
            ``"Decode Throughput (tokens/s)"`` columns. It is not modified.

    Returns:
        A new DataFrame with one row per matched pair and two added columns,
        ``"Prefill Latency Speedup (%)"`` and
        ``"Decode Throughput Speedup (%)"``, rounded to two decimals. Rows
        whose speedup is not below 1000% (NaN and inf included) are dropped.
    """
    # separate original model experiments from BetterTransformer experiments;
    # boolean-mask indexing returns new frames, so the input stays untouched
    optimization = llm_perf_df["Optimization π οΈ"]
    is_float16 = llm_perf_df["DType π₯"] == "float16"
    original_df = llm_perf_df[(optimization == "None") & is_float16]
    optimized_df = llm_perf_df[(optimization == "BetterTransformer") & is_float16]

    # merge the two dataframes
    merged = pd.merge(
        original_df,
        optimized_df,
        on=["Model π€", "Quantization ποΈ"],
        suffixes=("", " BetterTransformer"),
    )

    # compute speedups; subtract before rounding so the stored value really
    # has two decimals (rounding first leaves noise such as 12.340000000000003)
    prefill_ratio = merged["Prefill Latency (s)"] / merged["Prefill Latency (s) BetterTransformer"]
    decode_ratio = (
        merged["Decode Throughput (tokens/s) BetterTransformer"] / merged["Decode Throughput (tokens/s)"]
    )
    merged["Prefill Latency Speedup (%)"] = (prefill_ratio * 100 - 100).round(2)
    merged["Decode Throughput Speedup (%)"] = (decode_ratio * 100 - 100).round(2)

    # filter speedups >= 1000% (NaN compares False and is dropped as well)
    plausible = (merged["Prefill Latency Speedup (%)"] < 1000) & (
        merged["Decode Throughput Speedup (%)"] < 1000
    )
    return merged[plausible]
def _get_bt_speedup_fig(llm_perf_df, y, title, yaxis_title):
    """Build the per-architecture box plot for one BetterTransformer speedup column.

    Args:
        llm_perf_df: DataFrame passed through to ``get_bt_df``.
        y: Name of the speedup column plotted on the y axis.
        title: Figure title text.
        yaxis_title: Label of the y axis.

    Returns:
        The Plotly figure: one box per architecture, colored by quantization
        scheme, with every point drawn and a hover tooltip listing each
        column of ``BETTERTRANSFORMER_DATA``.
    """
    bt_df = get_bt_df(llm_perf_df)
    # plot
    fig = px.box(
        bt_df,
        x="Arch ποΈ",
        y=y,
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=BETTERTRANSFORMER_DATA,
        color="Quantization ποΈ",
        points="all",
    )
    # add hover data: customdata[i] holds the i-th column of BETTERTRANSFORMER_DATA
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
        )
    )
    # add layout
    fig.update_layout(
        title={
            "text": title,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title=yaxis_title,
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return fig


def get_bt_decode_fig(llm_perf_df):
    """Return the box plot of decode throughput speedup per architecture."""
    return _get_bt_speedup_fig(
        llm_perf_df,
        y="Decode Throughput Speedup (%)",
        title="Decode Throughput Speedup per Architecture",
        yaxis_title="Decode Speedup (%)",
    )


def get_bt_prefill_fig(llm_perf_df):
    """Return the box plot of prefill latency speedup per architecture."""
    return _get_bt_speedup_fig(
        llm_perf_df,
        y="Prefill Latency Speedup (%)",
        title="Prefill Latency Speedup per Architecture",
        yaxis_title="Prefill Speedup (%)",
    )
def create_bt_plots(llm_perf_df):
    """Create the hint text and the two BetterTransformer speedup plots.

    Args:
        llm_perf_df: DataFrame handed to both figure builders.

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of Gradio plot components.
    """
    # hint shown above the plots
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")
    # build both figures first, prefill before decode
    figures = (get_bt_prefill_fig(llm_perf_df), get_bt_decode_fig(llm_perf_df))
    # wrap each figure in a plot component, keeping the same order
    prefill_plot, decode_plot = [
        gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures
    ]
    return prefill_plot, decode_plot
|