BenchmarkBot's picture
added llm perf score
223c247
raw
history blame
10.5 kB
import os
import math
import gradio as gr
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler
from src.assets.text_content import (
TITLE,
INTRODUCTION_TEXT,
SINGLE_A100_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
)
from src.utils import (
change_tab,
restart_space,
load_dataset_repo,
make_clickable_model,
# make_clickable_score,
# num_to_str,
)
from src.assets.css_html_js import custom_css, custom_js
# Hub repositories: the Space that hosts this app and the dataset with the reports.
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"

# Access token read from the environment; None when the variable is not set.
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN")

# Raw dataframe column -> label shown in the leaderboard table.
# The insertion order is also the column order of the table.
COLUMNS_MAPPING = {
    # model / configuration
    "model": "Model πŸ€—",
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Load Dtype πŸ“₯",
    "optimizations": "Optimizations πŸ› οΈ",
    # composite score
    "perf": "Open LLM-Perf Score ⬆️",
    # individual measurements
    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
    "score": "Open LLM Score ⬆️",
    "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
    "num_params": "#️⃣ Parameters (M) πŸ“",
}

# Gradio datatype of each table column, positionally aligned with COLUMNS_MAPPING.
COLUMNS_DATATYPES = [
    "markdown",  # model
    "str",  # backend.name
    "str",  # backend.torch_dtype
    "str",  # optimizations
    "number",  # perf
    "number",  # generate.throughput(tokens/s)
    "number",  # score
    "number",  # forward.peak_memory(MB)
    "number",  # num_params
]

# Display column(s) the leaderboard table is sorted by.
SORTING_COLUMN = [COLUMNS_MAPPING["perf"]]
# Handle returned by load_dataset_repo for the benchmark dataset; it is only
# pulled when truthy (presumably a local git clone — confirm in src.utils).
llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)


def _format_optimizations(bettertransformer, load_in_8bit, load_in_4bit):
    """Return a display label listing the optimizations whose flag equals True.

    The enabled names are joined with ", "; "None" is returned when no flag
    is set.
    """
    enabled = [
        label
        for flag, label in (
            (bettertransformer, "BetterTransformer"),
            (load_in_8bit, "LLM.int8"),
            (load_in_4bit, "LLM.fp4"),
        )
        # Compare with `== True` rather than truthiness so that NaN flags
        # (truthy floats) are treated as disabled.
        if flag == True  # noqa: E712
    ]
    return ", ".join(enabled) if enabled else "None"


def get_benchmark_df(benchmark="1xA100-80GB"):
    """Load a benchmark report and enrich it with scores and derived columns.

    The report ``./llm-perf-dataset/reports/{benchmark}.csv`` is left-merged
    on ``model`` with ``open-llm-leaderboard.csv``; rows without a ``score``
    are dropped. Two columns are then added:

    * ``perf``: inverse Euclidean distance to the point (score=100,
      latency=0), min-max normalised to [0, 100] and rounded to 2 decimals.
    * ``optimizations``: human-readable list of the enabled backend
      optimizations (see ``_format_optimizations``).

    Args:
        benchmark: Name of the report file (without ``.csv``) to load.

    Returns:
        A new ``pandas.DataFrame`` with the merged and derived columns.
    """
    if llm_perf_dataset_repo:
        llm_perf_dataset_repo.git_pull()

    # load and merge
    bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
    scores_df = pd.read_csv("./llm-perf-dataset/reports/open-llm-leaderboard.csv")
    bench_df = bench_df.merge(scores_df, on="model", how="left")

    # filter out models with no score; copy so the column assignments below
    # write to an independent frame instead of a slice of the merged one
    bench_df = bench_df[bench_df["score"].notna()].copy()

    # create composite score: element-wise distance to the ideal point.
    # `math.sqrt` only accepts scalars, so the root is taken with `** 0.5`,
    # which pandas applies to the whole column.
    score_distance = 100 - bench_df["score"]
    latency_distance = bench_df["generate.latency(s)"]
    distance = (score_distance**2 + latency_distance**2) ** 0.5
    perf = 1 / distance

    # normalize between 0 and 100
    perf_min = perf.min()
    spread = perf.max() - perf_min
    if spread > 0:
        perf = (perf - perf_min) / spread * 100
    else:
        # A single row (or identical values) has no spread; avoid 0/0 -> NaN
        # and rate every row as the best available.
        perf = perf * 0 + 100.0

    # round to 2 decimals
    bench_df["perf"] = perf.round(2)

    # add optimizations (tuple iteration avoids positional Series indexing
    # and also works when the frame is empty)
    flag_columns = [
        "backend.bettertransformer",
        "backend.load_in_8bit",
        "backend.load_in_4bit",
    ]
    bench_df["optimizations"] = [
        _format_optimizations(*flags)
        for flags in bench_df[flag_columns].itertuples(index=False, name=None)
    ]

    return bench_df
def get_benchmark_table(bench_df):
    """Build the leaderboard table shown in the UI from a benchmark dataframe.

    Keeps only the columns listed in ``COLUMNS_MAPPING`` (in that order),
    renames them to their display labels, sorts by ``SORTING_COLUMN``, wraps
    the model name with ``make_clickable_model`` and scales the parameter
    count by ``1 / (1024 * 1024)`` (truncated to an integer).

    Args:
        bench_df: Dataframe containing every key of ``COLUMNS_MAPPING``.
            It is not modified.

    Returns:
        A new ``pandas.DataFrame`` ready to be displayed.
    """
    # Look the display labels up instead of repeating the literal strings, so
    # that a label change in COLUMNS_MAPPING cannot cause a KeyError here.
    model_column = COLUMNS_MAPPING["model"]
    params_column = COLUMNS_MAPPING["num_params"]

    # filter + rename (both return new frames; the caller's data is untouched)
    table_df = bench_df[list(COLUMNS_MAPPING.keys())].rename(columns=COLUMNS_MAPPING)

    # sort
    # NOTE(review): ascending=True lists the lowest score first — confirm
    # whether the best (highest) score should be shown at the top instead.
    table_df = table_df.sort_values(by=SORTING_COLUMN, ascending=True)

    # transform
    table_df[model_column] = table_df[model_column].apply(make_clickable_model)
    table_df[params_column] = table_df[params_column].apply(
        lambda x: int(x / (1024 * 1024))
    )

    return table_df
def get_benchmark_plot(bench_df):
    """Return a Plotly scatter figure of score against generation latency.

    Each point is one row of ``bench_df``: x is ``generate.latency(s)``, y is
    ``score``, colour follows ``model_type``, the marker symbol follows
    ``backend.name`` and the marker size follows ``forward.peak_memory(MB)``.
    Rows with a latency of 150 s or more are left out of the figure.
    """
    # Temporary cut-off until falcon gets fixed / natively supported.
    plotted_df = bench_df[bench_df["generate.latency(s)"] < 150]

    # Columns exposed to the hover template, in customdata index order.
    hover_columns = [
        "model",
        "backend.name",
        "backend.torch_dtype",
        "optimizations",
        "forward.peak_memory(MB)",
        "generate.throughput(tokens/s)",
    ]
    hover_lines = [
        "Model: %{customdata[0]}",
        "Backend: %{customdata[1]}",
        "Datatype: %{customdata[2]}",
        "Optimizations: %{customdata[3]}",
        "Peak Memory (MB): %{customdata[4]}",
        "Throughput (tokens/s): %{customdata[5]}",
        "Per 1000 Tokens Latency (s): %{x}",
        "Open LLM Score: %{y}",
    ]
    # Centered title anchored near the top of the figure.
    title_layout = {
        "text": "Model Score vs. Latency vs. Memory",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }

    figure = px.scatter(
        plotted_df,
        x="generate.latency(s)",
        y="score",
        color="model_type",
        symbol="backend.name",
        size="forward.peak_memory(MB)",
        custom_data=hover_columns,
        symbol_sequence=["triangle-up", "circle"],
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    figure.update_layout(
        title=title_layout,
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score",
        legend_title="Model Type and Backend",
        width=1200,
        height=600,
    )
    figure.update_traces(hovertemplate="<br>".join(hover_lines))

    return figure
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    score,
    memory,
    benchmark="1xA100-80GB",
):
    """Reload the benchmark data and apply the control-panel filters.

    Args:
        text: Case-insensitive search string matched against ``model``. It is
            treated as a regular expression when it is a valid one and as a
            literal substring otherwise, so input such as ``"("`` or
            ``"c++"`` no longer raises.
        backends: Accepted values of ``backend.name``.
        datatypes: Accepted values of ``backend.torch_dtype``.
        optimizations: Optimization names; a row is kept when its
            ``optimizations`` label contains at least one of them. An empty
            selection disables this filter.
        score: Minimum ``score`` (inclusive).
        memory: Maximum ``forward.peak_memory(MB)`` (inclusive).
        benchmark: Report name forwarded to ``get_benchmark_df``.

    Returns:
        A ``(table, plot)`` tuple built from the filtered rows with
        ``get_benchmark_table`` and ``get_benchmark_plot``.
    """
    import re  # local import: only needed to validate the user's pattern

    raw_df = get_benchmark_df(benchmark=benchmark)

    # The search box is free-form user input: keep regex search for valid
    # patterns, fall back to a plain substring match for anything else.
    query = (text or "").lower()
    try:
        re.compile(query)
        query_is_regex = True
    except re.error:
        query_is_regex = False

    mask = (
        raw_df["model"].str.lower().str.contains(query, regex=query_is_regex, na=False)
        & raw_df["backend.name"].isin(backends)
        & raw_df["backend.torch_dtype"].isin(datatypes)
        & (raw_df["score"] >= score)
        & (raw_df["forward.peak_memory(MB)"] <= memory)
    )

    if optimizations:
        # Names such as "LLM.int8" contain regex metacharacters, so they are
        # matched literally.
        optimization_mask = pd.concat(
            [
                raw_df["optimizations"].str.contains(optimization, regex=False)
                for optimization in optimizations
            ],
            axis=1,
        ).any(axis="columns")
        mask = mask & optimization_mask

    filtered_df = raw_df[mask]

    filtered_table = get_benchmark_table(filtered_df)
    filtered_plot = get_benchmark_plot(filtered_df)

    return filtered_table, filtered_plot
# Dataframes
# Computed once at import time; they provide the initial (unfiltered) table
# and plot shown in the UI before any filter is applied.
single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
single_A100_table = get_benchmark_table(single_A100_df)
single_A100_plot = get_benchmark_plot(single_A100_df)
# Demo interface
# Everything created inside `with demo:` is attached to the Blocks app; the
# statement order below is the on-page order of the components.
demo = gr.Blocks(css=custom_css)
with demo:
    # leaderboard title
    gr.HTML(TITLE)

    # introduction text
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # control panel title
    gr.HTML("<h2>Control Panel πŸŽ›οΈ</h2>")

    # control panel interface
    # first row: model search, minimum score and maximum memory
    with gr.Row():
        with gr.Column(scale=1):
            search_bar = gr.Textbox(
                label="Model πŸ€—",
                info="πŸ” Search for a model name",
                elem_id="search-bar",
            )
        with gr.Column(scale=1):
            with gr.Box():
                # no minimum/maximum given: the Slider's default range is used
                score_slider = gr.Slider(
                    label="Open LLM Score πŸ“ˆ",
                    info="🎚️ Slide to minimum Open LLM score",
                    value=0,
                    elem_id="threshold-slider",
                )
        with gr.Column(scale=1):
            with gr.Box():
                # range 0 .. 80 * 1024 MB, starting at the maximum (no filtering)
                memory_slider = gr.Slider(
                    label="Peak Memory (MB) πŸ“ˆ",
                    info="🎚️ Slide to maximum Peak Memory",
                    minimum=0,
                    maximum=80 * 1024,
                    value=80 * 1024,
                    elem_id="memory-slider",
                )

    # second row: categorical filters, all choices selected by default
    with gr.Row():
        with gr.Column(scale=1):
            backend_checkboxes = gr.CheckboxGroup(
                label="Backends 🏭",
                choices=["pytorch", "onnxruntime"],
                value=["pytorch", "onnxruntime"],
                info="β˜‘οΈ Select the backends",
                elem_id="backend-checkboxes",
            )
        with gr.Column(scale=1):
            datatype_checkboxes = gr.CheckboxGroup(
                label="Datatypes πŸ“₯",
                choices=["float32", "float16"],
                value=["float32", "float16"],
                info="β˜‘οΈ Select the load datatypes",
                elem_id="datatype-checkboxes",
            )
        with gr.Column(scale=2):
            # choices mirror the labels written to the "optimizations" column
            optimizations_checkboxes = gr.CheckboxGroup(
                label="Optimizations πŸ› οΈ",
                choices=["None", "BetterTransformer", "LLM.int8", "LLM.fp4"],
                value=["None", "BetterTransformer", "LLM.int8", "LLM.fp4"],
                info="β˜‘οΈ Select the optimizations",
                elem_id="optimizations-checkboxes",
            )

    # third row: button that applies the filters
    with gr.Row():
        filter_button = gr.Button(
            value="Filter πŸš€",
            elem_id="filter-button",
        )

    # leaderboard tabs
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ–₯️ A100-80GB Leaderboard πŸ†", id=0):
            gr.HTML(SINGLE_A100_TEXT)

            # Original leaderboard table
            single_A100_leaderboard = gr.components.Dataframe(
                value=single_A100_table,
                datatype=COLUMNS_DATATYPES,
                headers=list(COLUMNS_MAPPING.values()),
                elem_id="1xA100-table",
            )

        with gr.TabItem("πŸ–₯️ A100-80GB Plot πŸ“Š", id=1):
            # Same description text as on the leaderboard tab
            gr.HTML(SINGLE_A100_TEXT)

            # Original leaderboard plot
            single_A100_plotly = gr.components.Plot(
                value=single_A100_plot,
                elem_id="1xA100-plot",
                show_label=False,
            )

    # The inputs are passed positionally to filter_query (its `benchmark`
    # argument keeps its default); the two outputs replace the table and plot.
    filter_button.click(
        filter_query,
        [
            search_bar,
            backend_checkboxes,
            datatype_checkboxes,
            optimizations_checkboxes,
            score_slider,
            memory_slider,
        ],
        [single_A100_leaderboard, single_A100_plotly],
    )

    # collapsed citation box with a copy button
    with gr.Row():
        with gr.Accordion("πŸ“™ Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)

    # On page load, call change_tab with a hidden textbox as input and the tab
    # container as output; custom_js is supplied as the client-side hook
    # (presumably it selects the tab to open — confirm in src.assets / src.utils).
    dummy = gr.Textbox(visible=False)
    demo.load(
        change_tab,
        dummy,
        tabs,
        _js=custom_js,
    )
# Restart space every hour
# The job runs on a background thread every 3600 seconds and calls
# restart_space(LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN). The scheduler must
# be started before the app is launched below.
scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    "interval",
    seconds=3600,
    args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
)
scheduler.start()

# Launch demo
# Enable the request queue (concurrency_count=40) and start serving.
demo.queue(concurrency_count=40).launch()