jerryzh168's picture
Add torchao int4 weight only quantization as an option
3795233
raw
history blame
9.58 kB
from typing import List
import gradio as gr
from src.leaderboard import get_leaderboard_df
from src.llm_perf import get_llm_perf_df
# from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
# from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
from src.map import get_lat_score_mem_fig
def create_control_panel(
    machine: str,
    subsets: List[str],
    backends: List[str],
    hardware_provider: str,
    hardware_type: str,
):
    """Build the collapsed "Control Panel" accordion and its filter widgets.

    Args:
        machine: Machine identifier, stored in a ``gr.State``.
        subsets: Subset names, stored in a ``gr.State``.
        backends: Backend names, stored in a ``gr.State``. The backend
            checkbox choices come from ``hardware_provider``, not from this
            argument.
        hardware_provider: Either ``"nvidia"`` or ``"intel"``; selects the
            backend, attention, quantization and kernel choices offered.
        hardware_type: Hardware type, stored in a ``gr.State``.

    Returns:
        A 12-tuple, in this order: filter button, machine state, backends
        state, hardware-type state, subsets state, score slider, memory
        slider, backend checkboxes, datatype checkboxes, optimization
        (attention) checkboxes, quantization checkboxes, kernel checkboxes.

    Raises:
        ValueError: If ``hardware_provider`` is not a known provider.
    """
    # Fixed values are wrapped in State components so that they can be
    # listed as callback inputs next to the interactive widgets.
    machine_value = gr.State(value=machine)
    subsets_value = gr.State(value=subsets)
    backends_value = gr.State(value=backends)
    hardware_type_value = gr.State(value=hardware_type)

    # Per provider: (backends, attention implementations, quantizations, kernels).
    provider_options = {
        "nvidia": (
            ["pytorch"],
            ["Eager", "SDPA", "FAv2"],
            [
                "Unquantized",
                "BnB.4bit",
                "BnB.8bit",
                "AWQ.4bit",
                "GPTQ.4bit",
                "torchao.4bit",
            ],
            [
                "No Kernel",
                "GPTQ.ExllamaV1",
                "GPTQ.ExllamaV2",
                "AWQ.GEMM",
                "AWQ.GEMV",
            ],
        ),
        "intel": (
            ["pytorch", "onnxruntime", "openvino"],
            ["Eager"],
            ["Unquantized"],
            ["No Kernel"],
        ),
    }
    if hardware_provider not in provider_options:
        raise ValueError(f"Unknown hardware provider: {hardware_provider}")
    (
        backend_choices,
        attention_choices,
        quantization_choices,
        kernel_choices,
    ) = provider_options[hardware_provider]

    def panel_column(scale):
        # Every widget sits in its own "panel"-styled column.
        return gr.Column(scale=scale, variant="panel")

    # The memory slider spans 0..80 GiB expressed in MB and starts at the
    # maximum, i.e. nothing is filtered out initially.
    max_memory_mb = 80 * 1024

    with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
        with gr.Row():
            with panel_column(2):
                score_slider = gr.Slider(
                    label="Open LLM Score (%) πŸ“ˆ",
                    info="🎚️ Slide to minimum Open LLM score",
                    value=0,
                    elem_id="threshold-slider",
                )
            with panel_column(2):
                memory_slider = gr.Slider(
                    label="Peak Memory (MB) πŸ“ˆ",
                    info="🎚️ Slide to maximum Peak Memory",
                    minimum=0,
                    maximum=max_memory_mb,
                    value=max_memory_mb,
                    elem_id="memory-slider",
                )
            with panel_column(1):
                backend_checkboxes = gr.CheckboxGroup(
                    label="Backends 🏭",
                    choices=backend_choices,
                    value=backend_choices,
                    info="β˜‘οΈ Select the backends",
                    elem_id="backend-checkboxes",
                )
        with gr.Row():
            with panel_column(1):
                datatype_checkboxes = gr.CheckboxGroup(
                    label="Precision πŸ“₯",
                    choices=["float32", "float16", "bfloat16"],
                    value=["float32", "float16", "bfloat16"],
                    info="β˜‘οΈ Select the load data types",
                    elem_id="dtype-checkboxes",
                )
            with panel_column(1):
                optimization_checkboxes = gr.CheckboxGroup(
                    label="Attentions πŸ‘οΈ",
                    choices=attention_choices,
                    value=attention_choices,
                    info="β˜‘οΈ Select the optimization",
                    elem_id="optimization-checkboxes",
                )
        with gr.Row():
            with panel_column(1):
                quantization_checkboxes = gr.CheckboxGroup(
                    label="Quantizations πŸ—œοΈ",
                    choices=quantization_choices,
                    value=quantization_choices,
                    info="β˜‘οΈ Select the quantization schemes",
                    elem_id="quantization-checkboxes",
                    elem_classes="boxed-option",
                )
            with panel_column(1):
                kernels_checkboxes = gr.CheckboxGroup(
                    label="Kernels βš›οΈ",
                    choices=kernel_choices,
                    value=kernel_choices,
                    info="β˜‘οΈ Select the custom kernels",
                    elem_id="kernel-checkboxes",
                    elem_classes="boxed-option",
                )
        with gr.Row():
            filter_button = gr.Button(
                value="Filter πŸš€",
                elem_id="filter-button",
                elem_classes="boxed-option",
            )

    # NOTE: backends/hardware_type come before subsets here; callers that
    # unpack positionally depend on this exact order.
    return (
        filter_button,
        machine_value,
        backends_value,
        hardware_type_value,
        subsets_value,
        score_slider,
        memory_slider,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
        kernels_checkboxes,
    )
def filter_rows_fn(
    machine,
    subsets,
    backends,
    hardware_type,
    # inputs
    score,
    memory,
    backend_checkboxes,
    precisions,
    attentions,
    quantizations,
    kernels,
    # interactive
    columns,
    search,
):
    """Filter the LLM-perf data by the control-panel values.

    Args:
        machine, subsets, backends, hardware_type: Forwarded to
            ``get_llm_perf_df`` to load the data frame.
        score: Minimum accepted value of the "Open LLM Score (%)" column.
        memory: Maximum accepted value of the "Memory (MB)" column.
        backend_checkboxes: Accepted values of the backend column.
        precisions: Accepted values of the precision column.
        attentions: Accepted values of the attention column.
        quantizations: Accepted values of the quantization column.
        kernels: Accepted values of the kernel column.
        columns: Columns to keep in the returned table.
        search: Text matched case-insensitively against the model column.
            It is used as a regular expression when it compiles as one,
            otherwise it is matched literally.

    Returns:
        A two-element list: the column-selected table produced by
        ``select_columns_fn`` and the figure produced by
        ``get_lat_score_mem_fig`` for the filtered rows.
    """
    # Local import so this block does not depend on the file's import header.
    import re

    # The search text is typed by the user. An invalid pattern such as
    # "llama(" would make ``str.contains`` raise ``re.error`` and fail the
    # whole callback, so fall back to a literal match in that case.
    try:
        re.compile(search)
    except re.error:
        search = re.escape(search)

    llm_perf_df = get_llm_perf_df(
        machine=machine, subsets=subsets, backends=backends, hardware_type=hardware_type
    )

    keep_rows = (
        llm_perf_df["Model πŸ€—"].str.contains(search, case=False)
        & llm_perf_df["Backend 🏭"].isin(backend_checkboxes)
        & llm_perf_df["Precision πŸ“₯"].isin(precisions)
        & llm_perf_df["Attention πŸ‘οΈ"].isin(attentions)
        & llm_perf_df["Quantization πŸ—œοΈ"].isin(quantizations)
        & llm_perf_df["Kernel βš›οΈ"].isin(kernels)
        & (llm_perf_df["Open LLM Score (%)"] >= score)
        & (llm_perf_df["Memory (MB)"] <= memory)
    )
    filtered_llm_perf_df = llm_perf_df[keep_rows]

    # The (already sanitised) search text is forwarded so the table is
    # filtered with the same pattern as the figure.
    selected_filtered_llm_perf_df = select_columns_fn(
        machine, subsets, backends, hardware_type, columns, search, filtered_llm_perf_df
    )
    selected_filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_llm_perf_df)

    # Only the table and the latency/score/memory figure are produced; the
    # additional prefill/decode figures are currently disabled.
    return [
        selected_filtered_llm_perf_df,
        selected_filtered_lat_score_mem_fig,
    ]
def create_control_callback(
    # button
    filter_button,
    # fixed
    machine_value,
    subsets_value,
    backends_value,
    hardware_type_value,
    # inputs
    score_slider,
    memory_slider,
    backend_checkboxes,
    datatype_checkboxes,
    optimization_checkboxes,
    quantization_checkboxes,
    kernels_checkboxes,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
    lat_score_mem_plot,
):
    """Register ``filter_rows_fn`` as the click handler of the filter button.

    The components are handed to ``filter_button.click`` in the positional
    order of ``filter_rows_fn``'s parameters: the four fixed state values,
    then the control-panel widgets, then the column selector and search bar.
    The handler's outputs are the leaderboard table and the
    latency/score/memory plot; the additional prefill/decode plots are
    currently disabled.
    """
    fixed_components = [
        machine_value,
        subsets_value,
        backends_value,
        hardware_type_value,
    ]
    control_components = [
        score_slider,
        memory_slider,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
        kernels_checkboxes,
    ]
    interactive_components = [columns_checkboxes, search_bar]
    output_components = [leaderboard_table, lat_score_mem_plot]

    filter_button.click(
        fn=filter_rows_fn,
        inputs=fixed_components + control_components + interactive_components,
        outputs=output_components,
    )
def select_columns_fn(
    machine, subsets, backends, hardware_type, columns, search, llm_perf_df=None
):
    """Return the leaderboard table restricted by search text and columns.

    Args:
        machine, subsets, backends, hardware_type: Forwarded to
            ``get_llm_perf_df`` when ``llm_perf_df`` is not supplied.
        columns: Column labels to keep in the result.
        search: Text matched case-insensitively against the model column.
            It is used as a regular expression when it compiles as one,
            otherwise it is matched literally.
        llm_perf_df: Optional pre-loaded (and possibly pre-filtered) data
            frame; when ``None`` the data is loaded with ``get_llm_perf_df``.

    Returns:
        The data frame returned by ``get_leaderboard_df``, reduced to the
        rows whose model column matches ``search`` and to ``columns``.
    """
    # Local import so this block does not depend on the file's import header.
    import re

    if llm_perf_df is None:
        llm_perf_df = get_llm_perf_df(
            machine=machine,
            subsets=subsets,
            backends=backends,
            hardware_type=hardware_type,
        )

    # The search text is typed by the user. An invalid pattern such as
    # "llama(" would make ``str.contains`` raise ``re.error``, so fall back
    # to a literal match in that case.
    try:
        re.compile(search)
    except re.error:
        search = re.escape(search)

    leaderboard_df = get_leaderboard_df(llm_perf_df)
    matches_search = leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
    return leaderboard_df[matches_search][columns]
def create_select_callback(
    # fixed
    machine_value,
    subsets_value,
    backends_value,
    hardware_type_value,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Re-run ``select_columns_fn`` when the column selection or search changes.

    The same handler, inputs and output are registered on the ``change``
    event of the column checkboxes and of the search bar, in that order.
    The handler's single output is the leaderboard table.
    """
    for trigger in (columns_checkboxes, search_bar):
        # A fresh inputs list is built for each registration.
        trigger.change(
            fn=select_columns_fn,
            inputs=[
                machine_value,
                subsets_value,
                backends_value,
                hardware_type_value,
                columns_checkboxes,
                search_bar,
            ],
            outputs=[leaderboard_table],
        )