"""Gradio demo comparing greedy and assisted generation latency.

Benchmark rows are read from ``data.csv``; for the task and GPU picked in the
UI a grouped bar chart is rendered to ``FIGURE_PATH`` and shown in the page.
"""
import functools

import matplotlib

# The backend has to be selected before pyplot is imported. Agg renders to
# files only, which is all this script needs.
matplotlib.use('Agg')

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

FIGURE_PATH = "plt.png"
FIG_DPI = 300

# NOTE(review): the task strings passed to get_plot (e.g. "Whisper: ARS") are
# compared against the `task` column of data.csv, so they are kept verbatim --
# verify against the data before renaming any of them.

TITLE_MD = """
# Assisted Generation Benchmark
"""

OPT_OPEN_MD = """
### Assistant Model
- `facebook/opt-125m`
### Model Names:
- 1.3B: `facebook/opt-1.3b`
- 6.7B: `facebook/opt-6.7b`
- 30B: `facebook/opt-30b`
- 66B: `facebook/opt-66b`
### Dataset used as input prompt:
- C4 (en, validation set)
"""

OPT_SUMM_MD = """
### Assistant Model
- `facebook/opt-125m`
### Model Names:
- 1.3B: `facebook/opt-1.3b`
- 6.7B: `facebook/opt-6.7b`
- 30B: `facebook/opt-30b`
- 66B: `facebook/opt-66b`
### Dataset used as input prompt:
- CNN Dailymail (3.0.0, validation set)
"""

WHISPER_MD = """
### Assistant Model
- `openai/whisper-tiny`
### Model Names:
- large-v2: `openai/whisper-large-v2`
### Dataset used as input prompt:
- Librispeech ARS (clean, validation set)
"""

CODEGEN_MD = """
### Assistant Model
- `Salesforce/codegen-350M-mono`
### Model Names:
- 2B: `Salesforce/codegen-2B-mono`
- 6B: `Salesforce/codegen-6B-mono`
- 16B: `Salesforce/codegen-16B-mono`
### Dataset used as input prompt:
- The Stack (python)
"""

FLAN_T5_MD = """
### Assistant Model
- `google/flan-t5-small`
### Model Names:
- large: `google/flan-t5-large`
- xl: `google/flan-t5-xl`
- xxl: `google/flan-t5-xxl`
- ul2: `google/flan-ul2`
### Dataset used as input prompt:
- CNN Dailymail (3.0.0, validation set)
"""

ALL_GPUS = ["3090", "T4", "T4 *2", "A100 (80GB)"]


def get_plot(task, gpu, omit_offload):
    """Render the latency bar chart for one task/GPU pair.

    Args:
        task: Value matched against the ``task`` column of ``data.csv``.
        gpu: Value matched against the ``gpu`` column of ``data.csv``.
        omit_offload: ``"Yes"`` keeps only rows whose ``offload`` column is 0;
            any other value keeps every row.

    Returns:
        ``FIGURE_PATH``, the image file the chart was written to. The same
        file is overwritten on every call.
    """
    # slice the dataframe according to the inputs
    df = pd.read_csv("data.csv")
    selected = (df["task"] == task) & (df["gpu"] == gpu)
    if omit_offload == "Yes":
        selected &= (df["offload"] == 0)
    # Copy the selection so the column added below is written to an
    # independent frame instead of a slice of the original
    # (avoids pandas' SettingWithCopyWarning).
    df = df[selected].copy()

    # combine model name and dtype
    df["model and dtype"] = df["model_name"].str.cat(df["dtype"], sep=', ')

    # fuse the two columns to be compared (original and assisted generation)
    df = df.melt(
        id_vars=["task", "gpu", "model and dtype", "offload"],
        value_vars=["Greedy", "Assisted"],
        var_name="generation_type",
        value_name="generation_time",
    )

    g = sns.catplot(
        data=df,
        kind="bar",
        x="model and dtype",
        y="generation_time",
        hue="generation_type",
        palette={"Greedy": "blue", "Assisted": "orange"},
        alpha=.9,
    )
    ax = g.facet_axis(0, 0)
    fig = ax.figure
    try:
        g.despine(left=True)
        g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
        g.set_xticklabels(fontsize=7)
        g.set_yticklabels(fontsize=7)

        legend = g.legend
        if legend is not None:
            legend.set_title("Generation Type")
            plt.setp(legend.get_texts(), fontsize='7')  # for legend text

        # Add the number to the top of each bar
        for container in ax.containers:
            ax.bar_label(container, fontsize=7)

        fig.savefig(FIGURE_PATH, dpi=FIG_DPI)
    finally:
        # pyplot keeps every figure it created registered until it is closed;
        # without this each UI callback would leave another figure in memory.
        plt.close(fig)
    return FIGURE_PATH


# components shared across tabs
omit_offload_fn = functools.partial(
    gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
)


def gpu_selector_fn(gpu_list):
    """Return a GPU dropdown over ``gpu_list`` with its last entry preselected."""
    return gr.Dropdown(
        gpu_list, value=gpu_list[-1], label="GPU", interactive=True
    )


def _refresh_on_change(plot_fn, gpu_selector, omit_offload, plot):
    """Redraw ``plot`` with ``plot_fn`` whenever either input component changes."""
    plot_inputs = [gpu_selector, omit_offload]
    gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
    omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)


def _add_sidebar_tab(tab_label, task, gpu_list, notes):
    """Add a tab with controls and notes in a narrow column and the plot beside it.

    Must be called inside an active ``gr.Tabs()`` context. The initial plot is
    drawn for the last entry of ``gpu_list``, the same value the dropdown
    preselects.
    """
    plot_fn = functools.partial(get_plot, task)
    with gr.TabItem(tab_label):
        with gr.Row():
            with gr.Column(scale=0.3, min_width=200):
                gpu_selector = gpu_selector_fn(gpu_list)
                omit_offload = omit_offload_fn()
                gr.Markdown(notes)
            # Show plot when the gradio app is initialized
            plot = gr.Image(value=plot_fn(gpu_list[-1], "No"))
        # Update plot when any of the inputs change
        _refresh_on_change(plot_fn, gpu_selector, omit_offload, plot)


demo = gr.Blocks()

with demo:
    gr.Markdown(TITLE_MD)

    with gr.Tabs():
        # NOTE(review): the nesting of this first tab (selectors side by side,
        # plot and notes below the row) was reconstructed from a source whose
        # indentation was lost -- confirm against the intended layout.
        with gr.TabItem("OPT: Open"):
            plot_fn = functools.partial(get_plot, "OPT: Open Text Generation")
            with gr.Row():
                with gr.Column():
                    gpu_selector = gpu_selector_fn(ALL_GPUS)
                with gr.Column():
                    omit_offload = omit_offload_fn()
            # Show plot when the gradio app is initialized
            plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))
            gr.Markdown(OPT_OPEN_MD)
            # Update plot when any of the inputs change
            _refresh_on_change(plot_fn, gpu_selector, omit_offload, plot)

        _add_sidebar_tab("OPT: Summ", "OPT: Summarization", ALL_GPUS, OPT_SUMM_MD)
        _add_sidebar_tab("Whisper: ARS", "Whisper: ARS", ["3090", "T4"], WHISPER_MD)
        _add_sidebar_tab("CodeGen: Code", "CodeGen: Code Generation", ALL_GPUS, CODEGEN_MD)
        _add_sidebar_tab("Flan-T5: Summ", "Flan-T5: Summarization", ALL_GPUS, FLAN_T5_MD)

        with gr.TabItem("Benchmark Info"):
            gr.Dataframe(
                headers=["Parameter", "Value"],
                value=[
                    ["Transformers Version", "4.29dev0"],
                    ["Pytorch Version", "2.0.0"],
                    ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
                    ["CUDA", "11.8 (3090) / 11.3 (others GPUs)"],
                    ["Number of input samples", "20-100 (depending on the model size)"],
                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
                ],
            )

demo.launch()