Spaces:
Runtime error
Runtime error
Commit
·
32aafee
1
Parent(s):
52a11ab
Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,7 @@ TFLOPS = GPU_EFFICIENCY*TFLOPS
|
|
34 |
|
35 |
# in ms
|
36 |
def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
|
37 |
-
exec_time = comp_flop/TFLOPS
|
38 |
exec_time *= 1000
|
39 |
if include_overhead:
|
40 |
exec_time = max(exec_time, THREAD_OVERHEAD)
|
@@ -169,24 +169,21 @@ st.latex("A \in \mathbb{R}^{MxK}, B \in \mathbb{R}^{KxN}, C \in \mathbb{R}^{MxN}")
|
|
169 |
st.markdown('''
|
170 |
To execute this operation on the GPU, we need to
|
171 |
1. Read A, B from memory
|
172 |
-
2. Perform
|
173 |
3. Write C to memory
|
174 |
''')
|
175 |
|
176 |
-
|
177 |
-
st.latex('''
|
178 |
-
For float16 operations (2 bytes), we can estimate the memory access time of A as follows:
|
179 |
-
T_mem(A) = 2*M*K / BW_mem
|
180 |
-
where BW_mem is the memory bandwidth of the GPU (e.g. 1935 GB/s for A100)
|
181 |
-
''')
|
182 |
-
|
183 |
st.markdown("For float16 operations (2 bytes), we can estimate the memory access time of A as follows:")
|
184 |
st.latex("T_{mem}(A) = 2*M*K / BW_{mem}")
|
185 |
-
st.markdown("where BW_mem is the memory bandwidth of the GPU (e.g. 1935 GB/s for A100)")
|
186 |
-
|
187 |
-
|
188 |
|
|
|
|
|
|
|
189 |
|
|
|
|
|
190 |
|
191 |
breakdown = st.checkbox("Show breakdown per operation")
|
192 |
if breakdown:
|
|
|
34 |
|
35 |
# in ms
|
36 |
def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
|
37 |
+
exec_time = max(comp_flop/TFLOPS, mem_bytes/GB_S)
|
38 |
exec_time *= 1000
|
39 |
if include_overhead:
|
40 |
exec_time = max(exec_time, THREAD_OVERHEAD)
|
|
|
169 |
st.markdown('''
|
170 |
To execute this operation on the GPU, we need to
|
171 |
1. Read A, B from memory
|
172 |
+
2. Perform matrix multiplication
|
173 |
3. Write C to memory
|
174 |
''')
|
175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
st.markdown("For float16 operations (2 bytes), we can estimate the memory access time of A as follows:")
|
177 |
st.latex("T_{mem}(A) = 2*M*K / BW_{mem}")
|
178 |
+
st.markdown("where BW_mem is the memory bandwidth of the GPU (e.g. 1935 GB/s for an A100 GPU)")
|
179 |
+
st.markdown("The total time spent on memory access is T_mem = T_mem(A) + T_mem(B) + T_mem(C)")
|
|
|
180 |
|
181 |
+
st.markdown("We can estimate the compute time for the math operations as follows:")
|
182 |
+
st.latex("T_{math}(A \cdot B) = 2*M*K*N / BW_{math}")
|
183 |
+
st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
|
184 |
|
185 |
+
st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
|
186 |
+
st.latex(r"\max(T_{math}, T_{mem})")
|
187 |
|
188 |
breakdown = st.checkbox("Show breakdown per operation")
|
189 |
if breakdown:
|