Spaces:

iblfe
/

test

Runtime error

App Files Files Community

test / benchmarks /llm_gpu_benchmark.py

iblfe

Upload folder using huggingface_hub

b585c7f verified over 1 year ago

raw

history blame contribute delete

5.58 kB



	# %%
	import json

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import plotly.io as pio
	from plotly.subplots import make_subplots

	# %%
	# Load the raw benchmark records from ``llm_gpu_benchmarks.json``, resolved relative to the
	# current working directory.
	# The file is generated using the command
	# curl -sSL https://raw.githubusercontent.com/h2oai/h2ogpt/main/benchmarks/perf.json | jq -s '.' > llm_gpu_benchmarks.json
	# The encoding is given explicitly so that parsing does not depend on the platform's
	# default text encoding.
	with open('llm_gpu_benchmarks.json', encoding='utf-8') as f:
	    data = json.load(f)
	del f  # keep the cell-based interactive namespace free of the closed file handle

	# %%
	# Flatten the parsed JSON records into a dataframe
	df = pd.json_normalize(data)
	del data  # the raw records are not used again once the dataframe exists

	# %%
	# Tidy up the raw benchmark dataframe
	# Discard the columns that are not needed
	df.drop(
	    [
	        'task', 'ngpus', 'reps', 'date', 'git_sha', 'transformers', 'bitsandbytes', 'cuda', 'hostname',
	        'summarize_input_len_bytes',
	    ],
	    axis='columns',
	    inplace=True,
	)
	# Use a more descriptive name for the GPU count
	df.rename({'n_gpus': 'gpu_count'}, axis='columns', inplace=True)
	# Derive gpu_name and gpu_memory_gb from the free-text gpus column.
	# Rows whose text does not match the patterns (the CPU runs) end up with missing values.
	df["gpu_name"] = df["gpus"].str.extract(r'[1-9] x ([\w\- ]+) .+', expand=False)
	# The number inside the parentheses is divided by 1024 and rounded
	# (presumably MiB -> GiB; confirm against the data), then stored as a nullable integer.
	df["gpu_memory_gb"] = (
	    pd.to_numeric(df["gpus"].str.extract(r'[\w ]+ \(([\d]+) .+', expand=False), errors='coerce')
	    .div(1024)
	    .round()
	    .astype('Int64')
	)
	del df["gpus"]
	# Shorten the GPU names and prefix them with the memory size
	df["gpu_name"] = (
	    df["gpu_name"]
	    .str.replace('NVIDIA ', '')
	    .str.replace('GeForce ', '')
	    .str.replace('A100-SXM4-80GB', 'A100 SXM4')
	)
	df["gpu_name"] = df["gpu_memory_gb"].astype(str) + "-" + df["gpu_name"]
	# Remove CPUs: they are the rows left without a GPU name
	df.drop(index=df.index[df["gpu_name"].isna()], inplace=True)

	# %%
	# Keep only the first row for every backend / model / bits / GPU count / GPU name combination
	df.drop_duplicates(subset=['backend', 'base_model', 'bits', 'gpu_count', 'gpu_name'], keep='first', inplace=True)

	# %% Add baseline comparison columns
	# The CPU results for 4, 8, and 16 bit quantization are simplified to a single baseline
	# value per task, expressed in output bytes per second.
	cpu_summary_out_throughput = 1353 / 1216  # summarize_output_len_bytes / summarize_time
	cpu_generate_out_throughput = 849 / 180  # generate_output_len_bytes / generate_time

	# GPU output throughput for each task (output bytes divided by elapsed time)
	df["summary_out_throughput"] = df["summarize_output_len_bytes"].div(df["summarize_time"])
	df["generate_out_throughput"] = df["generate_output_len_bytes"].div(df["generate_time"])
	# The same throughput expressed as a multiple of the CPU baseline
	df["summary_out_throughput_normalize"] = df["summary_out_throughput"].div(cpu_summary_out_throughput)
	df["generate_out_throughput_normalize"] = df["generate_out_throughput"].div(cpu_generate_out_throughput)

	# %%
	# df.to_excel('tmp/scratchpad/output/llm_gpu_benchmarks.xlsx', index=False)

	# %%
	# Select the "browser" renderer as plotly's default
	pio.renderers.default = "browser"

	# %%
	# Bar colour per quantization width, keyed by the bit count as a string;
	# the first three colours of the D3 qualitative palette are used.
	bits_bar_colors = dict(zip(('4', '8', '16'), px.colors.qualitative.D3))

	# Distinct values that define the figures (backends) and the subplot grid (models x GPU counts)
	backends = list(df["backend"].unique())
	base_models = list(df["base_model"].unique())
	n_gpus = list(df["gpu_count"].unique())

	# %%
	# Write one HTML report per backend. Each report is a grid of horizontal bar charts with one
	# row per GPU count and, for every base model, a summarization column followed by a
	# generation column. Bars show the throughput normalized by the CPU baseline, one bar
	# group per quantization width.
	bit_widths = sorted(df.bits.unique())  # loop-invariant, so it is computed once
	for backend in backends:
	    # for backend in ['transformers']:
	    backend_df = df[df.backend == backend]  # filter once per figure, not once per trace
	    fig_bar = make_subplots(rows=len(n_gpus),
	                            cols=len(base_models) * 2,
	                            shared_xaxes='all',
	                            shared_yaxes='columns',
	                            start_cell="top-left",
	                            vertical_spacing=0.1,
	                            print_grid=False,
	                            row_titles=[f'{gpu_count} GPUs' for gpu_count in n_gpus],
	                            # NOTE(review): these titles are hard-coded and assume base_models holds
	                            # exactly these three models in this order - confirm against the data.
	                            column_titles=['llama2-7b-chat Summarization', 'llama2-7b-chat Generation',
	                                           'llama2-13b-chat Summarization', 'llama2-13b-chat Generation',
	                                           'llama2-70b-chat Summarization', 'llama2-70b-chat Generation'],)

	    # Legend labels that already own a legend item in this figure. Every subplot adds a trace
	    # with the same name, so without this each label would be listed once per subplot.
	    legend_shown = set()

	    # for base_model in ['h2oai/h2ogpt-4096-llama2-7b-chat']:
	    for model_pos, base_model in enumerate(base_models):
	        for grid_row, gpu_count in enumerate(n_gpus, start=1):
	            for bits in bit_widths:
	                sub_df = backend_df[(backend_df.base_model == base_model) &
	                                    (backend_df.gpu_count == gpu_count) &
	                                    (backend_df.bits == bits)].sort_values(by='gpu_name')
	                # (legend prefix, value column, position inside the model's column pair)
	                for prefix, value_column, pair_offset in (
	                        ('sum', 'summary_out_throughput_normalize', 1),
	                        ('gen', 'generate_out_throughput_normalize', 2)):
	                    label = f'{prefix}-{bits} bits'
	                    # Only a trace that has data may own the legend item, so that a group is
	                    # not left represented by an empty trace (presumably not listed by plotly).
	                    owns_legend_item = (not sub_df.empty) and label not in legend_shown
	                    if owns_legend_item:
	                        legend_shown.add(label)
	                    fig_bar.add_trace(go.Bar(x=sub_df[value_column],
	                                             y=sub_df.gpu_name,
	                                             name=label,
	                                             legendgroup=label,
	                                             showlegend=owns_legend_item,
	                                             marker=dict(color=bits_bar_colors[f'{bits}']),
	                                             orientation='h'),
	                                      row=grid_row,
	                                      col=model_pos * 2 + pair_offset)

	    fig_bar.update_layout(plot_bgcolor='rgb(250,250,250)',
	                          showlegend=True,
	                          barmode="group")
	    # fig_bar.show()
	    fig_bar.write_html(f'llm_gpu_benchmark_{backend}.html', include_plotlyjs='cdn')