# %%
import json

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# %%
# Read the json file
# This script processes the llm_gpu_benchmarks.json file in the tmp/inputs folder.
# The file is generated using the command
# curl -sSL https://raw.githubusercontent.com/h2oai/h2ogpt/main/benchmarks/perf.json | jq -s '.' > llm_gpu_benchmarks.json
with open('llm_gpu_benchmarks.json') as f:
    data = json.load(f)
del f

# %%
# Load the json data into a dataframe
df = pd.json_normalize(data)
del data

# %%
# Process the dataframe
# Drop columns that are not needed
df.drop(columns=['task', 'ngpus', 'reps', 'date', 'git_sha', 'transformers', 'bitsandbytes', 'cuda', 'hostname',
                 'summarize_input_len_bytes'], inplace=True)
# Rename columns
df.rename(columns={'n_gpus': 'gpu_count'}, inplace=True)
# Split the gpus column into gpu_name and gpu_memory_gb
df["gpu_name"] = df.gpus.str.extract(r'[1-9] x ([\w\- ]+) .+')
df["gpu_memory_gb"] = round(
    pd.to_numeric(df.gpus.str.extract(r'[\w ]+ \(([\d]+) .+', expand=False), errors='coerce') / 1024)
df["gpu_memory_gb"] = df["gpu_memory_gb"].astype('Int64')
df.drop(columns=['gpus'], inplace=True)
# Clean up gpu_name and prefix it with the memory size, e.g. "80-A100 SXM4"
df.gpu_name = df.gpu_name.str.replace('NVIDIA ', '')
df.gpu_name = df.gpu_name.str.replace('GeForce ', '')
df.gpu_name = df.gpu_name.str.replace('A100-SXM4-80GB', 'A100 SXM4')
df.gpu_name = df.gpu_memory_gb.astype(str) + "-" + df.gpu_name
# Remove CPU-only rows (no gpu_name could be extracted)
df.drop(df[df.gpu_name.isnull()].index, inplace=True)

# %%
# Remove duplicate rows
df.drop_duplicates(['backend', 'base_model', 'bits', 'gpu_count', 'gpu_name'], inplace=True)

# %% Add baseline comparison columns
# The CPU results for 4-, 8-, and 16-bit quantization are close enough that we simplify
# them to a single baseline value per task.
cpu_summary_out_throughput = 1353 / 1216  # bytes/second (calculated from summarize_output_len_bytes / summarize_time)
cpu_generate_out_throughput = 849 / 180  # bytes/second (calculated from generate_output_len_bytes / generate_time)
# Add GPU throughput columns
df["summary_out_throughput"] = df.summarize_output_len_bytes / df.summarize_time
df["generate_out_throughput"] = df.generate_output_len_bytes / df.generate_time
# Add GPU throughput boost columns (speed-up relative to the CPU baseline)
df["summary_out_throughput_normalize"] = df.summary_out_throughput / cpu_summary_out_throughput
df["generate_out_throughput_normalize"] = df.generate_out_throughput / cpu_generate_out_throughput

# %%
# df.to_excel('tmp/scratchpad/output/llm_gpu_benchmarks.xlsx', index=False)

# %%
pio.renderers.default = "browser"

# %%
bits_bar_colors = {'4': px.colors.qualitative.D3[0],
                   '8': px.colors.qualitative.D3[1],
                   '16': px.colors.qualitative.D3[2]}

backends = list(df.backend.unique())
base_models = list(df.base_model.unique())
n_gpus = list(df.gpu_count.unique())

# %%
for backend in backends:
    # for backend in ['transformers']:
    fig_bar = make_subplots(rows=len(n_gpus), cols=len(base_models) * 2,
                            shared_xaxes='all', shared_yaxes='columns',
                            start_cell="top-left", vertical_spacing=0.1, print_grid=False,
                            row_titles=[f'{gpu_count} GPUs' for gpu_count in n_gpus],
                            # Column titles assume base_models comes back in 7b, 13b, 70b order
                            column_titles=['llama2-7b-chat Summarization', 'llama2-7b-chat Generation',
                                           'llama2-13b-chat Summarization', 'llama2-13b-chat Generation',
                                           'llama2-70b-chat Summarization', 'llama2-70b-chat Generation'])
    # for base_model in ['h2oai/h2ogpt-4096-llama2-7b-chat']:
    for base_model in base_models:
        for gpu_count in n_gpus:
            for bits in sorted(df.bits.unique()):
                sub_df = df[(df.backend == backend) & (df.base_model == base_model) &
                            (df.gpu_count == gpu_count) & (df.bits == bits)].sort_values(by='gpu_name')
                fig_bar.add_trace(go.Bar(x=sub_df.summary_out_throughput_normalize,
                                         y=sub_df.gpu_name,
                                         name=f'sum-{bits} bits',
                                         legendgroup=f'sum-{bits} bits',
                                         marker=dict(color=bits_bar_colors[f'{bits}']),
                                         orientation='h'),
                                  row=n_gpus.index(gpu_count) + 1,
                                  col=base_models.index(base_model) * 2 + 1)
                fig_bar.add_trace(go.Bar(x=sub_df.generate_out_throughput_normalize,
                                         y=sub_df.gpu_name,
                                         name=f'gen-{bits} bits',
                                         legendgroup=f'gen-{bits} bits',
                                         marker=dict(color=bits_bar_colors[f'{bits}']),
                                         orientation='h'),
                                  row=n_gpus.index(gpu_count) + 1,
                                  col=base_models.index(base_model) * 2 + 2)

    fig_bar.update_layout(plot_bgcolor='rgb(250,250,250)', showlegend=True, barmode="group")
    # fig_bar.show()
    fig_bar.write_html(f'llm_gpu_benchmark_{backend}.html', include_plotlyjs='cdn')
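
# %%
# Optional sanity check (an addition, not part of the original workflow): list the
# processed rows behind the charts, sorted by normalized generation throughput.
# Only columns computed above are used; the head(20) cutoff is arbitrary.
print(df.sort_values('generate_out_throughput_normalize', ascending=False)
      [['backend', 'base_model', 'bits', 'gpu_count', 'gpu_name',
        'summary_out_throughput_normalize', 'generate_out_throughput_normalize']]
      .head(20)
      .to_string(index=False))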