BenchmarkBot's picture
weird behavior
5c9f565
raw
history blame
No virus
13.5 kB
import os
import gradio as gr
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler
from src.assets.css_html_js import custom_css
from src.assets.text_content import (
TITLE,
INTRODUCTION_TEXT,
ABOUT_TEXT,
EXAMPLE_CONFIG_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
)
from src.utils import (
restart_space,
load_dataset_repo,
process_model_name,
process_model_type,
)
# Each hardware target gets its own top-level tab; the emoji is appended to
# the tab label. Names and emojis are paired here and split into the two
# parallel lists the rest of the file iterates over.
_HARDWARE_TABS = (
    ("A100-80GB", "πŸ–₯️"),
    ("RTX4090-24GB", "πŸ’»"),
)
HARDWARE_NAMES = [name for name, _ in _HARDWARE_TABS]
HARDWARES_EMOJIS = [emoji for _, emoji in _HARDWARE_TABS]

# Repository identifiers: the leaderboard itself (passed to restart_space)
# and the dataset holding the benchmark reports.
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"

# Access token read from the environment; None when the variable is unset.
OPTIMUM_TOKEN = os.getenv("OPTIMUM_TOKEN")
# One entry per leaderboard column, in display order:
#   (raw dataframe column, displayed header, gradio Dataframe datatype)
# Keeping the three values side by side means the header mapping and the
# datatype list cannot drift out of alignment when a column is added,
# removed or reordered.
_COLUMN_SPECS = [
    # benchmark configuration
    ("backend.name", "Backend 🏭", "str"),
    ("backend.torch_dtype", "Dtype πŸ“₯", "str"),
    ("optimizations", "Optimizations πŸ› οΈ", "str"),
    ("quantization", "Quantization πŸ—œοΈ", "str"),
    # model family
    ("weight_class", "Class πŸ‹οΈ", "str"),
    ("model_type", "Type πŸ€—", "str"),
    # measurements
    ("generate.peak_memory(MB)", "Memory (MB) ⬇️", "number"),
    ("generate.throughput(tokens/s)", "Throughput (tokens/s) ⬆️", "number"),
    # tokens per kWh: more is better, so this uses the same "higher is
    # better" arrow as throughput (it previously carried the memory arrow).
    ("generate.energy_consumption(tokens/kWh)", "Energy (tokens/kWh) ⬆️", "number"),
    # rendered as text because get_benchmark_table may append "**" to it
    ("best_score", "Best Score (%) ⬆️", "str"),
    # model link
    ("best_scored_model", "Best Scored LLM πŸ†", "markdown"),
]

# raw dataframe column -> header shown in the leaderboard table / hover text
ALL_COLUMNS_MAPPING = {raw: header for raw, header, _ in _COLUMN_SPECS}

# gradio datatype of each displayed column, in the same order as the mapping
ALL_COLUMNS_DATATYPES = [datatype for _, _, datatype in _COLUMN_SPECS]
# Columns that together identify one leaderboard entry; get_benchmark_df
# keeps a single row per distinct combination of these.
NO_DUPLICATES_COLUMNS = [
    # benchmark configuration
    "backend.name",
    "backend.torch_dtype",
    "optimizations",
    "quantization",
    # model family
    "weight_class",
    "model_type",
]

# Sort keys in priority order, each paired with its direction
# (True = ascending). Split into the two parallel lists that
# DataFrame.sort_values expects.
_SORT_ORDER = (
    ("best_score", False),
    ("generate.latency(s)", True),
    ("generate.peak_memory(MB)", True),
)
SORTING_COLUMN = [column for column, _ in _SORT_ORDER]
SORTING_ASCENDING = [ascending for _, ascending in _SORT_ORDER]
# Handle for the dataset repository as returned by load_dataset_repo. When it
# is truthy, get_benchmark_df calls git_pull() on it before reading the CSVs.
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
def get_benchmark_df(benchmark="Succeeded-1xA100-80GB"):
    """Load one benchmark report and join it with the clustered leaderboard scores.

    Args:
        benchmark: Base name (without ``.csv``) of the report to read from
            ``./llm-perf-dataset/reports/``.

    Returns:
        The merged DataFrame, sorted by ``SORTING_COLUMN`` and reduced to the
        first row for each distinct combination of ``NO_DUPLICATES_COLUMNS``,
        with three derived columns added:
        ``generate.energy_consumption(tokens/kWh)``, ``optimizations`` and
        ``quantization``.
    """
    if llm_perf_dataset_repo:
        # pull the latest reports before reading them
        llm_perf_dataset_repo.git_pull()

    # load data
    benchmark_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
    clusters_df = pd.read_csv("./llm-perf-dataset/Clustered-Open-LLM-Leaderboard.csv")

    # merge on model
    merged_df = benchmark_df.merge(
        clusters_df, left_on="model", right_on="best_scored_model"
    )

    # Invert energy consumption: kWh/token -> tokens/kWh.
    # Only the energy cell of rows without a measurement becomes "N/A".
    # (Assigning through a bare boolean mask, as before, overwrote *every*
    # column of those rows, and the old "== 1" sentinel also caught genuine
    # readings of exactly 1 kWh/token.)
    kwh_per_token = merged_df["generate.energy_consumption(kWh/token)"]
    tokens_per_kwh = (1 / kwh_per_token).astype(object)
    merged_df["generate.energy_consumption(tokens/kWh)"] = tokens_per_kwh.where(
        kwh_per_token.notna(), "N/A"
    )

    # add optimizations
    # NaN is truthy in Python, so a missing flag must be checked explicitly or
    # it would be reported as "BetterTransformer".
    merged_df["optimizations"] = merged_df["backend.bettertransformer"].apply(
        lambda flag: "BetterTransformer" if pd.notna(flag) and flag else "None"
    )

    # add quantization scheme; anything other than the known strategies
    # (including a missing value) is shown as "None"
    quantization_labels = {"bnb": "BnB.4bit", "gptq": "GPTQ.4bit"}
    merged_df["quantization"] = merged_df["backend.quantization_strategy"].apply(
        lambda strategy: quantization_labels.get(strategy, "None")
    )

    # sort so that the preferred row of each configuration comes first ...
    merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
    # ... then keep only that first row per configuration
    merged_df.drop_duplicates(subset=NO_DUPLICATES_COLUMNS, inplace=True)

    return merged_df
def get_benchmark_table(bench_df):
    """Build the leaderboard table shown to the user from a benchmark DataFrame.

    Args:
        bench_df: DataFrame containing at least every key of
            ``ALL_COLUMNS_MAPPING`` as a column. It is not modified.

    Returns:
        A new DataFrame restricted to the ``ALL_COLUMNS_MAPPING`` columns and
        renamed to their display headers, with the model type and model name
        passed through ``process_model_type`` / ``process_model_name`` and
        ``**`` appended to the score of 4-bit quantized entries.
    """
    # Display headers are looked up instead of being repeated as literals so
    # this function cannot fall out of sync with ALL_COLUMNS_MAPPING.
    type_col = ALL_COLUMNS_MAPPING["model_type"]
    model_col = ALL_COLUMNS_MAPPING["best_scored_model"]
    score_col = ALL_COLUMNS_MAPPING["best_score"]
    quantization_col = ALL_COLUMNS_MAPPING["quantization"]

    # filter + rename (both steps return new frames, so bench_df is untouched)
    table_df = bench_df[list(ALL_COLUMNS_MAPPING)].rename(columns=ALL_COLUMNS_MAPPING)

    # transform
    table_df[type_col] = table_df[type_col].apply(process_model_type)
    table_df[model_col] = table_df[model_col].apply(process_model_name)

    # Mark the scores of quantized entries with "**".
    # This is done with a boolean mask rather than a row-wise apply:
    # DataFrame.apply(axis=1) on an empty frame yields a DataFrame, which
    # cannot be assigned to a single column, so a filter that matched nothing
    # used to raise here.
    quantized = table_df[quantization_col].isin(["BnB.4bit", "GPTQ.4bit"])
    if quantized.any():
        scores = table_df[score_col].astype(object)
        scores[quantized] = scores[quantized].map(lambda score: f"{score}**")
        table_df[score_col] = scores

    return table_df
def get_benchmark_chart(bench_df, max_latency=100):
    """Build the latency / score / memory scatter plot for a benchmark DataFrame.

    Args:
        bench_df: DataFrame containing ``generate.latency(s)``, ``best_score``,
            ``model_type`` and every key of ``ALL_COLUMNS_MAPPING``. It is not
            modified.
        max_latency: Rows whose latency exceeds this many seconds are left out
            of the plot. Defaults to 100, the previously hard-coded cutoff.

    Returns:
        A plotly figure with one point per remaining row: latency on x, score
        on y, point size from ``generate.peak_memory(MB)`` and colour from the
        processed model type. Hovering shows every leaderboard column.
    """
    chart_df = bench_df.copy()

    # Latency cells that are not plain numbers (e.g. placeholder strings) are
    # given an infinite latency so the cutoff below always removes them.
    chart_df["generate.latency(s)"] = chart_df["generate.latency(s)"].apply(
        lambda value: value if isinstance(value, (int, float)) else float("inf")
    )

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice (pandas chained-assignment warning).
    chart_df = chart_df[chart_df["generate.latency(s)"] <= max_latency].copy()

    # rename model_type
    chart_df["model_type"] = chart_df["model_type"].apply(process_model_type)

    fig = px.scatter(
        chart_df,
        y="best_score",
        x="generate.latency(s)",
        size="generate.peak_memory(MB)",
        color="model_type",
        # exposed to the hover template below, in mapping order
        custom_data=list(ALL_COLUMNS_MAPPING),
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_layout(
        title={
            "text": "Latency vs. Score vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Per 1000 tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Type",
        width=1200,
        height=600,
    )

    # one "<b>header:</b> value" line per leaderboard column; the index into
    # customdata matches the order passed to custom_data above
    hover_lines = [
        f"<b>{header}:</b> %{{customdata[{position}]}}"
        for position, header in enumerate(ALL_COLUMNS_MAPPING.values())
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    return fig
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    quantization_scheme,
    score,
    memory,
    hardware,
):
    """Apply the control-panel filters to one hardware's benchmark data.

    Args:
        text: Substring to look for (case-insensitively) in the model name.
        backends: Backend names to keep.
        datatypes: Torch dtypes to keep.
        optimizations: Optimization labels to keep; an empty selection
            disables this filter.
        quantization_scheme: Quantization labels to keep; an empty selection
            disables this filter.
        score: Minimum ``best_score``.
        memory: Maximum peak memory in MB.
        hardware: Hardware name used to select the ``Succeeded-1x{hardware}``
            report.

    Returns:
        A ``(table, chart)`` tuple built from the filtered rows with
        ``get_benchmark_table`` and ``get_benchmark_chart``.
    """
    raw_df = get_benchmark_df(benchmark=f"Succeeded-1x{hardware}")

    # The search box holds free text typed by the user, so it is matched
    # literally (regex=False): characters such as "(" or "+" would otherwise
    # be parsed as a regular expression and could raise re.error.
    keep = (
        raw_df["best_scored_model"]
        .str.lower()
        .str.contains((text or "").lower(), regex=False, na=False)
    )
    keep &= raw_df["backend.name"].isin(backends)
    keep &= raw_df["backend.torch_dtype"].isin(datatypes)

    # keep rows whose optimizations label contains any selected optimization
    if optimizations:
        matches_optimization = pd.Series(False, index=raw_df.index)
        for optimization in optimizations:
            matches_optimization |= raw_df["optimizations"].str.contains(
                optimization, regex=False, na=False
            )
        keep &= matches_optimization

    # keep rows whose quantization label equals any selected scheme
    if quantization_scheme:
        keep &= raw_df["quantization"].isin(quantization_scheme)

    # Coerce to numbers first so a non-numeric cell simply fails the
    # comparison (NaN compares False) instead of raising a TypeError.
    best_score = pd.to_numeric(raw_df["best_score"], errors="coerce")
    # Filter on the same memory column that the table, the sort order and the
    # chart use (generate.peak_memory), not the forward-pass measurement.
    peak_memory = pd.to_numeric(raw_df["generate.peak_memory(MB)"], errors="coerce")
    keep &= best_score >= score
    keep &= peak_memory <= memory

    filtered_df = raw_df[keep]
    filtered_table = get_benchmark_table(filtered_df)
    filtered_chart = get_benchmark_chart(filtered_df)
    return filtered_table, filtered_chart
# Demo interface
demo = gr.Blocks(css=custom_css)
with demo:
    # page header: leaderboard title followed by the introduction text
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")

    with gr.Tabs(elem_classes="leaderboard-tabs"):
        # Per-hardware components, keyed by hardware name, so the control
        # panel below can wire its filter button to each hardware's outputs.
        hardware_placeholders = {}
        hardware_tables = {}
        hardware_plots = {}

        ####################### HARDWARE TABS #######################
        for tab_index, (hardware, emoji) in enumerate(
            zip(HARDWARE_NAMES, HARDWARES_EMOJIS)
        ):
            # hidden textbox whose value (the hardware name) is passed to
            # filter_query as its `hardware` argument
            hardware_placeholders[hardware] = gr.Textbox(value=hardware, visible=False)

            with gr.TabItem(f"{hardware} {emoji}", id=tab_index):
                with gr.Tabs(elem_classes="hardware-tabs"):
                    # unfiltered dataframe backing both sub-tabs
                    hardware_df = get_benchmark_df(benchmark=f"Succeeded-1x{hardware}")

                    with gr.TabItem("Leaderboard πŸ…", id=0):
                        gr.HTML(
                            "πŸ‘‰ Scroll to the right πŸ‘‰ for additional columns.",
                            elem_id="descriptive-text",
                        )
                        # initial (unfiltered) leaderboard table
                        hardware_tables[hardware] = gr.components.Dataframe(
                            value=get_benchmark_table(hardware_df),
                            headers=list(ALL_COLUMNS_MAPPING.values()),
                            datatype=ALL_COLUMNS_DATATYPES,
                            elem_id="hardware-table",
                            # show_label=False,
                        )

                    with gr.TabItem("Plot πŸ“Š", id=1):
                        gr.HTML(
                            "πŸ‘† Hover over the points πŸ‘† for additional information.",
                            elem_id="descriptive-text",
                        )
                        # initial (unfiltered) leaderboard plot
                        hardware_plots[hardware] = gr.components.Plot(
                            value=get_benchmark_chart(hardware_df),
                            elem_id="hardware-plot",
                            show_label=False,
                        )

        ####################### CONTROL PANEL #######################
        with gr.TabItem("Control Panel πŸŽ›οΈ", id=2):
            gr.HTML(
                "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
                elem_id="descriptive-text",
            )

            # row 1: model search
            with gr.Row():
                with gr.Column():
                    search_bar = gr.Textbox(
                        label="Model πŸ€—",
                        info="πŸ” Search for a model name",
                        elem_id="search-bar",
                    )

            # row 2: score / memory thresholds and backend selection
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Box():
                        score_slider = gr.Slider(
                            label="Open LLM Score πŸ“ˆ",
                            info="🎚️ Slide to minimum Open LLM score",
                            value=0,
                            elem_id="threshold-slider",
                        )
                with gr.Column(scale=1):
                    with gr.Box():
                        memory_slider = gr.Slider(
                            label="Peak Memory (MB) πŸ“ˆ",
                            info="🎚️ Slide to maximum Peak Memory",
                            minimum=0,
                            maximum=80 * 1024,
                            value=80 * 1024,
                            elem_id="memory-slider",
                        )
                with gr.Column(scale=1):
                    backend_checkboxes = gr.CheckboxGroup(
                        label="Backends 🏭",
                        choices=["pytorch", "onnxruntime"],
                        value=["pytorch", "onnxruntime"],
                        info="β˜‘οΈ Select the backends",
                        elem_id="backend-checkboxes",
                    )

            # row 3: dtype / optimization / quantization selection
            with gr.Row():
                with gr.Column(scale=1):
                    datatype_checkboxes = gr.CheckboxGroup(
                        label="Load Dtypes πŸ“₯",
                        choices=["float32", "float16"],
                        value=["float32", "float16"],
                        info="β˜‘οΈ Select the load dtypes",
                        elem_id="dtype-checkboxes",
                    )
                with gr.Column(scale=1):
                    optimizations_checkboxes = gr.CheckboxGroup(
                        label="Optimizations πŸ› οΈ",
                        choices=["None", "BetterTransformer"],
                        value=["None", "BetterTransformer"],
                        info="β˜‘οΈ Select the optimizations",
                        elem_id="optimizations-checkboxes",
                    )
                with gr.Column(scale=1):
                    quantization_checkboxes = gr.CheckboxGroup(
                        label="Quantizations πŸ—œοΈ",
                        choices=["None", "BnB.4bit", "GPTQ.4bit"],
                        value=["None", "BnB.4bit", "GPTQ.4bit"],
                        info="β˜‘οΈ Select the quantization schemes",
                        elem_id="quantization-checkboxes",
                    )

            # row 4: the button that triggers filtering
            with gr.Row():
                filter_button = gr.Button(
                    value="Filter πŸš€",
                    elem_id="filter-button",
                )

            # One click handler per hardware: every handler receives the same
            # control values plus that hardware's hidden name textbox, and
            # refreshes that hardware's table and plot. The order matches the
            # positional parameters of filter_query.
            shared_filter_inputs = [
                search_bar,
                backend_checkboxes,
                datatype_checkboxes,
                optimizations_checkboxes,
                quantization_checkboxes,
                score_slider,
                memory_slider,
            ]
            for hardware in HARDWARE_NAMES:
                filter_button.click(
                    filter_query,
                    [*shared_filter_inputs, hardware_placeholders[hardware]],
                    [hardware_tables[hardware], hardware_plots[hardware]],
                )

        ####################### ABOUT TAB #######################
        with gr.TabItem("About πŸ“–", id=3):
            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
            gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")

    ####################### CITATION #######################
    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)
# Restart space every hour: a background job calls restart_space with the
# leaderboard repo id and the token once per 3600-second interval.
scheduler = BackgroundScheduler()
scheduler.add_job(
    func=restart_space,
    trigger="interval",
    seconds=3600,
    args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
)
scheduler.start()

# Launch demo with request queuing enabled
queued_demo = demo.queue(concurrency_count=10)
queued_demo.launch()