# app.py — Streamlit space approximating transformer inference time for
# Multi-Head Attention (MHA) vs Multi-Query Attention (MQA) decoding.
import streamlit as st
# --- Model hyperparameters (sidebar inputs) ---
st.sidebar.header("Transformer parameters")
# NOTE(review): col1/col2 are never referenced again — possibly leftover.
col1, col2 = st.sidebar.columns([2, 4])
bs = st.sidebar.number_input('Batch size', value=10)
h = st.sidebar.number_input('Num heads',value=16)
d = st.sidebar.number_input('Dimension', value=768)
l = st.sidebar.number_input('Num layers', value=24)
n_start = st.sidebar.number_input('Start seq', value=1)
n = st.sidebar.number_input('End seq', value=1024)
# --- Hardware constants for the selected GPU ---
st.sidebar.header("GPU parameters")
GPU = st.sidebar.selectbox('GPU', ('A100', 'V100'))
if GPU == 'A100':
    # A100 specs: peak fp16 throughput (FLOP/s) and memory bandwidth (B/s).
    TFLOPS = 312e12
    GB_S = 1935e9
elif GPU == 'V100':
    # V100 specs.
    TFLOPS = 112e12
    GB_S = 900e9
else:
    # Unreachable with the selectbox above, but kept as a guard.
    raise ValueError('Unknown GPU')
# Kernel-launch overhead, in ms (floor for any single kernel's time).
THREAD_OVERHEAD = st.sidebar.number_input('Thread overhead (in ms)', format="%.3f", value=0.005)
# Fraction of peak FLOPS actually achieved; scales TFLOPS down.
GPU_EFFICIENCY = st.sidebar.number_input('GPU efficiency', format="%.3f", value=0.5)
TFLOPS = GPU_EFFICIENCY*TFLOPS
# calc_exec_time below returns times in ms.
def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
    """Roofline-style kernel time estimate, in milliseconds.

    Assumes compute and memory traffic overlap perfectly, so the kernel
    takes the longer of the two. When include_overhead is True the result
    is floored at the kernel-launch overhead THREAD_OVERHEAD.
    """
    math_s = comp_flop / TFLOPS
    mem_s = mem_bytes / GB_S
    exec_ms = 1000 * max(math_s, mem_s)
    if include_overhead:
        return max(exec_ms, THREAD_OVERHEAD)
    return exec_ms
def qkv_mha_exec(bs, h, n, d):
    """Fused QKV projection for one decode token under multi-head attention.

    GEMM of shape (bs x d) @ (d x 3d); weights dominate the bytes moved.
    Returns (flop, nbytes, exec_time_ms).
    """
    # 2 FLOPs per multiply-accumulate.
    flop = 2 * bs * 1 * d * 3 * d
    # fp16 (2 bytes): read activations and weights, write Q/K/V.
    nbytes = 2 * bs * 1 * d + 2 * 3 * d * d + 2 * bs * 1 * 3 * d
    return flop, nbytes, calc_exec_time(flop, nbytes)
def qkv_mqa_exec(bs, h, n, d):
    """Fused QKV projection for one decode token under multi-query attention.

    K and V are shared across heads, so their projections shrink from d to
    d/h each — hence the (1 + 2/h) factor. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * 1 * d * (1 + 2 / h) * d
    # fp16: read activations + (2/h)-sized K/V weights, write shared K/V.
    nbytes = 2 * bs * 1 * d + 2 * (2 / h) * d * d + 2 * bs * 1 * (2 / h) * d
    return flop, nbytes, calc_exec_time(flop, nbytes)
def att1_mha_exec(bs, h, n, d):
    """Q @ K^T scores for one new token against n cached keys (MHA).

    Batched gemv per head: (1 x d/h) @ (d/h x n). Returns
    (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * h * (d / h) * n
    # fp16: read query + per-head cached keys, write n scores per head.
    nbytes = 2 * bs * h * (d / h) + 2 * bs * h * n * (d / h) + 2 * bs * h * n
    return flop, nbytes, calc_exec_time(flop, nbytes)
def att1_mqa_exec(bs, h, n, d):
    """Q @ K^T scores for one new token against n cached keys (MQA).

    Same FLOPs as MHA, but the key cache is shared across heads, so the
    key-read term drops the factor h. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * h * (d / h) * n
    # fp16: read query + single shared key cache, write n scores per head.
    nbytes = 2 * bs * h * (d / h) + 2 * bs * n * (d / h) + 2 * bs * h * n
    return flop, nbytes, calc_exec_time(flop, nbytes)
def att2_mha_exec(bs, h, n, d):
    """Attention-probabilities @ V gemm for one decode token (MHA).

    Per head: (1 x n) @ (n x d/h). Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * h * n * (d / h)
    # fp16: read probs + per-head cached values, write per-head output.
    nbytes = 2 * bs * h * n + 2 * bs * h * n * (d / h) + 2 * bs * h * (d / h)
    return flop, nbytes, calc_exec_time(flop, nbytes)
def att2_mqa_exec(bs, h, n, d):
    """Attention-probabilities @ V gemm for one decode token (MQA).

    The value cache is shared across heads (one read of bs*n*(d/h)), but the
    attention probabilities keep their per-head shape (bs*h*n) — MQA shares
    only K/V, not the scores. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2*bs*h*n*(d/h)
    # FIX: the first term was 2*bs*n*(d/h), a copy of the value-read term.
    # The probs read is 2*bs*h*n, matching the 2*bs*h*n score write in
    # att1_mqa_exec (and the probs read in att2_mha_exec).
    nbytes = 2*bs*h*n + 2*bs*n*(d/h) + 2*bs*h*(d/h)
    exec_time = calc_exec_time(flop, nbytes)
    return flop, nbytes, exec_time
def out_exec(bs, h, n, d):
    """Attention output projection for one token: (bs x d) @ (d x d).

    Identical cost under MHA and MQA. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * 1 * d * d
    # fp16: read input + weight matrix, write output.
    nbytes = 2 * bs * 1 * d + 2 * d * d + 2 * bs * 1 * d
    return flop, nbytes, calc_exec_time(flop, nbytes)
def softmax_exec(bs, h, n, d):
    """Softmax over n scores per head, modeled as purely memory-bound.

    FLOPs are treated as 0 — the app assumes element-wise ops never hit the
    compute roof. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 0
    # fp16: read then write bs*h*n scores.
    nbytes = 2 * bs * h * n + 2 * bs * h * n
    return flop, nbytes, calc_exec_time(flop, nbytes)
def ln_exec(bs, h, n, d):
    """Layer norm / residual pass over one token's d activations.

    Modeled as memory-bound (0 FLOPs): one read plus one write of the
    activation vector. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 0
    nbytes = 2 * bs * 1 * d + 2 * bs * 1 * d
    return flop, nbytes, calc_exec_time(flop, nbytes)
def mlp_exec(bs, h, n, d):
    """One MLP linear layer for a single token: (bs x d) @ (d x 4d).

    Callers count this twice per layer to cover both the up- and
    down-projection. Returns (flop, nbytes, exec_time_ms).
    """
    flop = 2 * bs * 1 * d * 4 * d
    # fp16: read input + weights, write 4d-wide hidden activations.
    nbytes = 2 * bs * 1 * d + 2 * d * 4 * d + 2 * bs * 1 * 4 * d
    return flop, nbytes, calc_exec_time(flop, nbytes)
def print_kernel_execution(flop, nbytes):
    """Render one kernel's FLOPs, bytes, time and launch overhead as a table.

    Times are recomputed without the overhead floor so the overhead can be
    shown separately on its own row.
    """
    left, right = st.columns([2, 3])
    exec_time = calc_exec_time(flop, nbytes, include_overhead=False)
    rows = (
        ("GFLOP:", str(round(flop/1e9, 2))),
        ("MB: ", str(round(nbytes/1e6, 2))),
        ("Time (ms):", str(exec_time)),
        ("Overhead (ms):", str(THREAD_OVERHEAD)),
    )
    for label, value in rows:
        left.write(label)
        right.write(value)
# --- Headline result: total decode time summed over generated positions ---
st.title("Inference time MHA vs MQA")
st.write("This space approximates the inference time for Multi-Query Attention and Multi-Head Attention transformers. You can change the hyperparameters in sidebar.")
mqa_total_time = 0.
mha_total_time = 0.
# Incremental decoding: one forward pass per position i with a KV cache of
# length i. NOTE(review): range() is end-exclusive, so position n itself is
# never simulated — confirm whether 'End seq' is meant to be inclusive.
for i in range(n_start, n):
    # Kernels whose cost is identical under MHA and MQA.
    # NOTE(review): ln_exec is counted five times per layer (2x + 3x);
    # presumably the extra passes stand in for residual adds and GeLU, which
    # have the same memory traffic — verify intent.
    shared_time = out_exec(bs, h, i, d)[2] + softmax_exec(bs, h, i , d)[2] + 2*ln_exec(bs, h, i, d)[2] \
        + 2*mlp_exec(bs, h, i, d)[2] + 3*ln_exec(bs, h, i, d)[2]
    mha_time = shared_time + qkv_mha_exec(bs, h, i, d)[2] + att1_mha_exec(bs, h, i, d)[2] + att2_mha_exec(bs, h, i, d)[2]
    mha_total_time += l*mha_time  # all l layers are identical
    mqa_time = shared_time + qkv_mqa_exec(bs, h, i, d)[2] + att1_mqa_exec(bs, h, i, d)[2] + att2_mqa_exec(bs, h, i, d)[2]
    mqa_total_time += l*mqa_time
c1, c2 = st.columns([2, 4])
c1.write("Multi-Head Attention:")
c2.write(str(round(mha_total_time, 2)))
c1.write("Multi-Query Attention:")
c2.write(str(round(mqa_total_time, 2)))
c1.write("Speed-up MQA over MHA:")
c2.write(str(round(mha_total_time/mqa_total_time,2)))
# --- Memory footprint: parameters and KV cache, fp16 (2 bytes/element) ---
st.subheader("Memory consumption")
st.caption("Multi-Head Attention")
c1, c2 = st.columns([2, 4])
num_params = 12*l*d*d  # per layer: 4*d^2 attention (QKV + out) + 8*d^2 MLP
c1.write("Num Parameters (in B)")
c2.write(str(round(num_params/1e9, 3)))
c1.write("Stored Parameters (GB)")
c2.write(str(round(2*num_params/1e9, 3)))
c1.write("Cached keys and values (GB)")
acts = round(2*bs*l*(d/h)*h*2*n/1e9, 2)  # K and V, one (d/h) slice per head
c2.write(str(acts))
st.caption("Multi-Query Attention")
c1, c2 = st.columns([2, 4])
# MQA shrinks the K and V projections from d*d to d*(d/h) each per layer.
num_params = (10+2/h)*l*d*d
c1.write("Num Parameters (in B)")
c2.write(str(round(num_params/1e9, 3)))
c1.write("Stored Parameters (GB)")
c2.write(str(round(2*num_params/1e9, 3)))
c1.write("Cached keys and values (GB)")
acts = round(2*bs*l*(d/h)*2*n/1e9, 2)  # single shared K/V head
c2.write(str(acts))
# --- Methodology write-up: how kernel execution time is approximated ---
st.subheader("Estimating execution time")
st.markdown("We use the [following crude approximation](https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#understand-perf) to estimate the execution time for each matrix multiplication.")
# FIX: LaTeX strings now use raw-string literals. "\c", "\i" and "\m" are
# invalid escape sequences in ordinary strings (SyntaxWarning since Python
# 3.12, slated to become an error); the rendered text is unchanged.
st.latex(r"C = A \cdot B")
st.latex(r"A \in \mathbb{R}^{MxK}, B \in R^{KxN}, C \in \mathbb{R}^{MxN}")
st.markdown('''
To execute this operation on the GPU, we need to
1. Read A, B from memory
2. Perform matrix multiplication
3. Write C to memory
''')
st.markdown("For float16 operations (2 bytes), we can estimate the memory access time of A as follows:")
st.latex(r"T_{mem}(A) = 2*M*K / BW_{mem}")
st.markdown("where BW_mem is the memory bandwidth of the GPU (e.g. 1935 GB/s for an A100 GPU)")
st.markdown("The total time on memory access is T_mem = T_mem(A) + T_mem(B) + T_mem(C)")
st.markdown("We can estimate the compute time for the math operations as follows:")
st.latex(r"T_{math}(A \cdot B) = 2*M*K*N / BW_{math}")
st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
st.latex(r"max(T_{math}, T_{mem})")
st.markdown("Note that there is a minimum time to execute the operation due to [kernel launch overhead](https://forums.developer.nvidia.com/t/any-way-to-measure-the-latency-of-a-kernel-launch/221413/2)")
# --- Per-operation breakdown, evaluated at the maximum sequence length n ---
# NOTE(review): each exec_time binding below is unused; print_kernel_execution
# recomputes the time from (flop, nbytes).
st.subheader("Inference time for Transformer operations")
st.markdown("We can now estimate the execution for each of the operations in the transformer model. I suggest you inspect the code for details on the calculations. ")
st.subheader('Attention layer')
st.markdown('**QKV projection**')
st.caption("Multi-Head Attention")
flop, nbytes, exec_time = qkv_mha_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.caption("Multi-Query Attention")
flop, nbytes, exec_time = qkv_mqa_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
# QK^T scores — one of the two kernels whose memory traffic MQA reduces.
st.markdown('**QK gemm**')
st.write("Showing calculation for the maximum sequence length (n)")
st.caption("Multi-Head Attention")
flop, nbytes, exec_time = att1_mha_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.caption("Multi-Query Attention")
flop, nbytes, exec_time = att1_mqa_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.markdown('**Attention-value gemm**')
st.write("Showing calculation for the maximum sequence length (n)")
st.caption("Multi-Head Attention")
flop, nbytes, exec_time = att2_mha_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.caption("Multi-Query Attention")
flop, nbytes, exec_time = att2_mqa_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.markdown('**Output projection**')
flop, nbytes, exec_time = out_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
# Element-wise attention ops, modeled as memory-bound.
st.markdown('**Element-wise ops**')
st.write("We also need to take into the softmax layer, layer norm, and residual connection. We assume that these operations are memory bound. ")
st.caption("Softmax")
flop, nbytes, exec_time = softmax_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.caption("Layer norm/residual connection")
flop, nbytes, exec_time = ln_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.subheader('MLP layer')
st.markdown('**First and Second Linear Layer**')
flop, nbytes, exec_time = mlp_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)
st.markdown('**Element-wise ops**')
st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
flop, nbytes, exec_time = ln_exec(bs, h, n, d)
print_kernel_execution(flop, nbytes)