Spaces:

optimum
/

llm-perf-leaderboard

Running

App Files Files Community

IlyasMoutawwakil HF staff committed on Jan 7

Commit

2460b35

•

1 Parent(s): bb5689a

added exllama v2

Browse files

Files changed (4) hide show

app.py +7 -1
src/bettertransformer.py +0 -1
src/control_panel.py +4 -0
src/exllama.py +148 -0

app.py CHANGED Viewed

@@ -5,8 +5,9 @@ import gradio as gr
 from src.control_panel import create_control_panel, create_control_callback
 from src.latency_score_memory import create_lat_score_mem_plot
 from src.leaderboard import create_leaderboard_table
-from src.flashattentionv2 import create_fa2_plots
 from src.bettertransformer import create_bt_plots
 from src.llm_perf import get_llm_perf_df
 from src.assets import custom_css
 from src.content import (
@@ -59,6 +60,9 @@ with demo:
                         bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
                     with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
                         fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
                 ####################### CONTROL CALLBACK #######################
                 create_control_callback(
                     filter_button,
@@ -78,6 +82,8 @@ with demo:
                     bt_decode_plot,
                     fa2_prefill_plot,
                     fa2_decode_plot,
                 )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):

 from src.control_panel import create_control_panel, create_control_callback
 from src.latency_score_memory import create_lat_score_mem_plot
 from src.leaderboard import create_leaderboard_table
 from src.bettertransformer import create_bt_plots
+from src.flashattentionv2 import create_fa2_plots
+from src.exllama import create_exllama_plots
 from src.llm_perf import get_llm_perf_df
 from src.assets import custom_css
 from src.content import (
                         bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
                     with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
                         fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
+                    with gr.TabItem("Exllama Speedup 📈", id=4):
+                        exllama_prefill_plot, exllama_decode_plot = create_exllama_plots(llm_perf_df)
                 ####################### CONTROL CALLBACK #######################
                 create_control_callback(
                     filter_button,
                     bt_decode_plot,
                     fa2_prefill_plot,
                     fa2_decode_plot,
+                    exllama_prefill_plot,
+                    exllama_decode_plot,
                 )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):

src/bettertransformer.py CHANGED Viewed

@@ -47,7 +47,6 @@ def get_bt_df(llm_perf_df):
     bt_df["Decode Throughput Speedup (%)"] = (
         (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
     bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
     bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]

     bt_df["Decode Throughput Speedup (%)"] = (
         (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
     bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
     bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]

src/control_panel.py CHANGED Viewed

@@ -144,6 +144,8 @@ def create_control_callback(
     bt_decode_plot,
     fa2_prefill_plot,
     fa2_decode_plot,
 ):
     filter_button.click(
         fn=filter_fn,
@@ -164,5 +166,7 @@ def create_control_callback(
             bt_decode_plot,
             fa2_prefill_plot,
             fa2_decode_plot,
         ],
     )

     bt_decode_plot,
     fa2_prefill_plot,
     fa2_decode_plot,
+    exllama_prefill_plot,
+    exllama_decode_plot,
 ):
     filter_button.click(
         fn=filter_fn,
             bt_decode_plot,
             fa2_prefill_plot,
             fa2_decode_plot,
+            exllama_prefill_plot,
+            exllama_decode_plot,
         ],
     )

src/exllama.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import gradio as gr
+import pandas as pd
+import plotly.express as px
+# Columns attached to every plotted point as plotly `custom_data`. The hover
+# templates enumerate this list in order, so each column is listed exactly
+# once to avoid repeating the same field in the tooltip.
+EXLLAMA_DATA = [
+    # open llm
+    "Model 🤗",
+    "Arch 🏛️",
+    "Params (B)",
+    "Open LLM Score (%)",
+    # deployment settings
+    "DType 📥",
+    "Backend 🏭",
+    "Quantization 🗜️",
+    # primary measurements
+    "Prefill Latency (s)",
+    "Prefill Latency (s) Exllama",
+    "Decode Throughput (tokens/s)",
+    "Decode Throughput (tokens/s) Exllama",
+    "E2E Throughput (tokens/s)",
+    "E2E Throughput (tokens/s) Exllama",
+    # speedups
+    "Prefill Latency Speedup (%)",
+    "Decode Throughput Speedup (%)",
+]
+def get_exllama_df(llm_perf_df):
+    """Pair GPTQ.4bit baseline rows with their Exllama runs and compute speedups.
+    Works on a copy of ``llm_perf_df``. Rows quantized with "GPTQ.4bit" are
+    joined on "Model 🤗" with the "GPTQ.4bit+ExllamaV1" and
+    "GPTQ.4bit+ExllamaV2" rows; columns coming from the Exllama side get the
+    " Exllama" suffix. Returns the stacked result with the two speedup
+    columns added, keeping only rows whose speedups are both below 1000.
+    """
+    perf_df = llm_perf_df.copy()
+    # separate the plain GPTQ baseline from the Exllama kernel experiments
+    baseline_df = perf_df[perf_df["Quantization 🗜️"] == "GPTQ.4bit"]
+    paired_frames = []
+    for kernel_scheme in ("GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"):
+        kernel_df = perf_df[perf_df["Quantization 🗜️"] == kernel_scheme]
+        # pair each baseline row with the same model's run under this kernel
+        paired_frames.append(
+            pd.merge(
+                baseline_df,
+                kernel_df,
+                on=["Model 🤗"],
+                suffixes=["", " Exllama"],
+            )
+        )
+    # stack the V1 and V2 pairings row-wise
+    exllama_df = pd.concat(paired_frames)
+    # report the Exllama variant as the row's quantization scheme
+    exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]
+    # latency speedup: baseline latency relative to Exllama latency
+    prefill_ratio = exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]
+    exllama_df["Prefill Latency Speedup (%)"] = (prefill_ratio * 100).round(2) - 100
+    # throughput speedup: Exllama throughput relative to baseline throughput
+    decode_ratio = (
+        exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]
+    )
+    exllama_df["Decode Throughput Speedup (%)"] = (decode_ratio * 100).round(2) - 100
+    # keep only rows whose speedups are below 1000%
+    for speedup_column in ("Prefill Latency Speedup (%)", "Decode Throughput Speedup (%)"):
+        exllama_df = exllama_df[exllama_df[speedup_column] < 1000]
+    return exllama_df
+def get_exllama_decode_fig(llm_perf_df):
+    """Build the box plot of decode throughput speedup per architecture.
+    The data comes from ``get_exllama_df(llm_perf_df)``; boxes are grouped by
+    "Arch 🏛️" and colored by "Quantization 🗜️ Exllama". Returns the figure.
+    """
+    speedup_df = get_exllama_df(llm_perf_df)
+    # one hover line per EXLLAMA_DATA column, addressed by its customdata slot
+    hover_lines = [f"<b>{name}:</b> %{{customdata[{slot}]}}" for slot, name in enumerate(EXLLAMA_DATA)]
+    # box plot with every individual point drawn alongside the boxes
+    fig = px.box(
+        speedup_df,
+        x="Arch 🏛️",
+        y="Decode Throughput Speedup (%)",
+        color="Quantization 🗜️ Exllama",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=EXLLAMA_DATA,
+        points="all",
+    )
+    fig.update_traces(hovertemplate="<br>".join(hover_lines))
+    # centered title anchored near the top of the figure
+    centered_title = dict(
+        text="Decode Throughput Speedup per Architecture",
+        y=0.95,
+        x=0.5,
+        xanchor="center",
+        yanchor="top",
+    )
+    fig.update_layout(
+        title=centered_title,
+        xaxis_title="LLM Architecture",
+        yaxis_title="Decode Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return fig
+def get_exllama_prefill_fig(llm_perf_df):
+    """Build the box plot of prefill latency speedup per architecture.
+    The data comes from ``get_exllama_df(llm_perf_df)``; boxes are grouped by
+    "Arch 🏛️" and colored by "Quantization 🗜️ Exllama". Returns the figure.
+    """
+    speedup_df = get_exllama_df(llm_perf_df)
+    # one hover line per EXLLAMA_DATA column, addressed by its customdata slot
+    hover_lines = [f"<b>{name}:</b> %{{customdata[{slot}]}}" for slot, name in enumerate(EXLLAMA_DATA)]
+    # box plot with every individual point drawn alongside the boxes
+    fig = px.box(
+        speedup_df,
+        x="Arch 🏛️",
+        y="Prefill Latency Speedup (%)",
+        color="Quantization 🗜️ Exllama",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=EXLLAMA_DATA,
+        points="all",
+    )
+    fig.update_traces(hovertemplate="<br>".join(hover_lines))
+    # centered title anchored near the top of the figure
+    centered_title = dict(
+        text="Prefill Latency Speedup per Architecture",
+        y=0.95,
+        x=0.5,
+        xanchor="center",
+        yanchor="top",
+    )
+    fig.update_layout(
+        title=centered_title,
+        xaxis_title="LLM Architecture",
+        yaxis_title="Prefill Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return fig
+def create_exllama_plots(llm_perf_df):
+    """Create the hint text and the two Exllama speedup plot components.
+    Emits a ``gr.HTML`` hint, builds the prefill and decode figures from
+    ``llm_perf_df``, wraps each in a ``gr.components.Plot`` and returns
+    ``(prefill_plot, decode_plot)``.
+    """
+    # the hint is created first, before either plot component
+    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
+    # build both figures before instantiating any plot component
+    figures = (get_exllama_prefill_fig(llm_perf_df), get_exllama_decode_fig(llm_perf_df))
+    # both plots use the same element id and hide their label
+    plot_options = {"elem_id": "plot", "show_label": False}
+    prefill_plot = gr.components.Plot(value=figures[0], **plot_options)
+    decode_plot = gr.components.Plot(value=figures[1], **plot_options)
+    return prefill_plot, decode_plot