Spaces:

optimum
/

llm-perf-leaderboard

Running

File size: 4,270 Bytes

import gradio as gr
import pandas as pd
import plotly.express as px


BETTERTRANSFORMER_DATA = [
    # open llm
    "Model 🤗",
    "Arch 🏛️",
    "DType 📥",
    "Backend 🏭",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType 📥",
    "Backend 🏭",
    "Quantization 🗜️",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) BetterTransformer",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) BetterTransformer",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) BetterTransformer",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]


def get_bt_df(llm_perf_df):
    bt_df = llm_perf_df.copy()
    # seperate original model experiments from BetterTransformer experiments
    original_df = bt_df[(bt_df["Optimization 🛠️"] == "None") & (bt_df["DType 📥"] == "float16")]
    bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
    # merge the two dataframes
    bt_df = pd.merge(
        original_df,
        bt_df,
        on=["Model 🤗", "Quantization 🗜️"],
        suffixes=["", " BetterTransformer"],
    )
    # compute speedups
    bt_df["Prefill Latency Speedup (%)"] = (
        (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
    ).round(2) - 100
    bt_df["Decode Throughput Speedup (%)"] = (
        (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
    ).round(2) - 100
    # filter speedups > 1000%
    bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
    bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]

    return bt_df


def get_bt_decode_fig(llm_perf_df):
    bt_df = get_bt_df(llm_perf_df)
    # plot
    decode_fig = px.box(
        bt_df,
        x="Arch 🏛️",
        y="Decode Throughput Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=BETTERTRANSFORMER_DATA,
        color="Quantization 🗜️",
        points="all",
    )
    # add hover data
    decode_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
        )
    )
    # add layout
    decode_fig.update_layout(
        title={
            "text": "Decode Throughput Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return decode_fig


def get_bt_prefill_fig(llm_perf_df):
    bt_df = get_bt_df(llm_perf_df)
    # plot
    prefill_fig = px.box(
        bt_df,
        x="Arch 🏛️",
        y="Prefill Latency Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=BETTERTRANSFORMER_DATA,
        color="Quantization 🗜️",
        points="all",
    )
    # add hover data
    prefill_fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
        )
    )
    # add layout
    prefill_fig.update_layout(
        title={
            "text": "Prefill Latency Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return prefill_fig


def create_bt_plots(llm_perf_df):
    # descriptive text
    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
    # get figures
    prefill_fig = get_bt_prefill_fig(llm_perf_df)
    decode_fig = get_bt_decode_fig(llm_perf_df)

    # create plots
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)

    return prefill_plot, decode_plot