Alan Liu committed
Commit ed50ee5
Parent: 6aa1c8b

add generation arithmetic intensity

Files changed (2):
  1. app.py +9 -6
  2. calc_util.py +43 -29
app.py CHANGED
@@ -57,7 +57,7 @@ subtotal_operations = [
 
 
 
-col1, col2, col3, col4, col5 = st.columns([1,1.5,2.3,2.3,0.1])
+col1, col2, col3, col4, col5 = st.columns([1,1.5,2.5,2.5,0.1])
 
 inference_config = {}
 parameter_count = {}
@@ -144,7 +144,7 @@ with col3: # Prefilling
 
     operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key in subtotal_operations}
-    prefilling_arithmetic_intensity = {key: "{:.3f}".format(prefilling_operation_count[key]/prefilling_activation_memory_count[key]) for key in prefilling_activation_memory_count}
+    prefilling_arithmetic_intensity = {key: "{:.3f}".format(prefilling_operation_count[key]/prefilling_activation_memory_count[key] if prefilling_activation_memory_count[key]>0 else float('inf')) for key in prefilling_activation_memory_count}
     prefilling_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in prefilling_activation_memory_count.items()}
 
 
@@ -182,15 +182,18 @@ with col4: # Generation
 
     operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key in subtotal_operations}
+    generation_arithmetic_intensity = {key: "{:.3f}".format(generation_operation_count[key]/generation_activation_memory_count[key] if generation_activation_memory_count[key]>0 else float('inf')) for key in generation_activation_memory_count}
     generation_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in generation_activation_memory_count.items()}
 
     ## Convert dictionaries to pandas dataframes for table display
     df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPS"])
     df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPS"])
 
-    #df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_count)
-    #df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_count)
-
+    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_count)
+    df_operation_count["Arithmetic Intensity"] = df_operation_count["Operation"].map(generation_arithmetic_intensity)
+    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_count)
+    df_subtotal_operation_count["Arithmetic Intensity"] = df_subtotal_operation_count["Operation"].map(generation_arithmetic_intensity)
+
     header4("Inference Ops: Generation")
     st.markdown(create_table(df_operation_count))
 
@@ -199,7 +202,7 @@ with col4: # Generation
     st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
     st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
    st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
-    #st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
+    st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
 
     if inference_config['KV_cache']:
        st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
calc_util.py CHANGED
@@ -184,8 +184,8 @@ def attention_softmax_activation_memory(model_config, inference_config, seq_length):
     per_head_per_layer = (2 * inference_config['batchsize'] * seq_length * seq_length)
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
 
-def attention_multV_activation_memory(model_config, inference_config, seq_length):
-    per_head_per_layer = inference_config['batchsize'] * seq_length * seq_length + 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size_per_head']
+def attention_multV_activation_memory(model_config, inference_config, seq_length_Q, seq_length_V):
+    per_head_per_layer = inference_config['batchsize'] * seq_length_Q * seq_length_V + inference_config['batchsize'] * seq_length_Q * model_config['hidden_size_per_head'] + inference_config['batchsize'] * seq_length_V * model_config['hidden_size_per_head']
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
 
 def attention_out_activation_memory(model_config, inference_config, seq_length):
@@ -215,7 +215,7 @@ def prefilling_activation_memory(model_config, inference_config):
     activation_memory['attention_V'] = attention_V_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
     activation_memory['attention_QK'] = attention_QK_activation_memory(model_config, inference_config, inference_config['input_seq_length'], inference_config['input_seq_length'])
     activation_memory['attention_softmax'] = attention_softmax_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
-    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'], inference_config['input_seq_length'])
     activation_memory['attention_out'] = attention_out_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
 
     activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
@@ -243,31 +243,50 @@ def prefilling_activation_memory(model_config, inference_config):
 
     return activation_memory
 
-
 def generation_activation_memory(model_config, inference_config):
-    # TODO Check how KV cache affects activation_memory
     activation_memory = {}
-
-    activation_memory['word_embedding'] = word_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['positional_embedding'] = positional_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-
-    activation_memory['attention_Q'] = attention_Q_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_K'] = attention_K_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_V'] = attention_V_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_QK'] = attention_QK_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'], inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_softmax'] = attention_softmax_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['attention_out'] = attention_out_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-
-    activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-
-    activation_memory['mlp1'] = mlp1_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-    activation_memory['mlp2'] = mlp2_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
-
+
+    activation_memory['word_embedding'] = 0
+    activation_memory['positional_embedding'] = 0
+    activation_memory['attention_K'] = 0
+    activation_memory['attention_V'] = 0
+    activation_memory['attention_Q'] = 0
+    activation_memory['attention_QK'] = 0
+    activation_memory['attention_softmax'] = 0
+    activation_memory['attention_multV'] = 0
+    activation_memory['attention_out'] = 0
+    activation_memory['mlp1'] = 0
+    activation_memory['mlp2'] = 0
+    activation_memory['layernorm'] = 0
+
+    for t in range(inference_config['output_seq_length']):
+        if inference_config['KV_cache']:
+            activation_memory['attention_K'] += attention_K_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_V'] += attention_V_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_Q'] += attention_Q_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_QK'] += attention_QK_activation_memory(model_config, inference_config, seq_length_Q=1, seq_length_K=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, 1)
+            activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=1, seq_length_V=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, 1)
+            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, 1)
+            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, 1)
+        else:
+            activation_memory['attention_K'] += attention_K_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_V'] += attention_V_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_Q'] += attention_Q_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_QK'] += attention_QK_activation_memory(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_K=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_V=(t+1)+inference_config['input_seq_length'])
+            activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+
+        activation_memory['layernorm'] += layernorm_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+
     activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
     activation_memory['attention'] = (
-        activation_memory['attention_Q'] + activation_memory['attention_K'] +
-        activation_memory['attention_V'] + activation_memory['attention_QK'] +
+        activation_memory['attention_K'] + activation_memory['attention_V'] +
+        activation_memory['attention_Q'] + activation_memory['attention_QK'] +
         activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
         activation_memory['attention_out']
     )
@@ -276,10 +295,5 @@ def generation_activation_memory(model_config, inference_config):
         activation_memory['embeddings'] + activation_memory['attention'] +
         activation_memory['mlp'] + activation_memory['layernorm']
     )
-
-    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
-    activation_memory['attention'] = sum([v for k,v in activation_memory.items() if 'attention' in k])
-    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
-    activation_memory['total'] = (activation_memory['attention'] + activation_memory['mlp'] + activation_memory['layernorm'])
 
     return activation_memory
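Note: to make the per-token accounting above concrete, here is a standalone sketch of the revised attention_multV activation term driven by the generation loop. The toy model_config and inference_config values are hypothetical; the function body mirrors the one in the diff rather than importing calc_util.py.

    # Toy config (hypothetical values, for illustration only).
    model_config = {'num_hidden_layers': 2, 'num_attention_heads': 4, 'hidden_size_per_head': 64}
    inference_config = {'batchsize': 1, 'input_seq_length': 128, 'output_seq_length': 4, 'KV_cache': True}

    def attention_multV_activation_memory(model_config, inference_config, seq_length_Q, seq_length_V):
        # Reads the (seq_length_Q x seq_length_V) score matrix and the V block,
        # writes a (seq_length_Q x head_dim) output, per head and per layer.
        b = inference_config['batchsize']
        d = model_config['hidden_size_per_head']
        per_head_per_layer = b * seq_length_Q * seq_length_V + b * seq_length_Q * d + b * seq_length_V * d
        return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer

    multV_total = 0
    for t in range(inference_config['output_seq_length']):
        if inference_config['KV_cache']:
            # With a KV cache, each step issues one query token against all cached K/V.
            multV_total += attention_multV_activation_memory(
                model_config, inference_config,
                seq_length_Q=1, seq_length_V=(t + 1) + inference_config['input_seq_length'])
        else:
            # Without a cache, the full prefix is recomputed at every step.
            L = (t + 1) + inference_config['input_seq_length']
            multV_total += attention_multV_activation_memory(
                model_config, inference_config, seq_length_Q=L, seq_length_V=L)
    print(f"attention_multV activation total over generation: {multV_total:,}")

With the cache, the query side stays at length 1 while only the K/V side grows with the generated prefix, which is consistent with generation being far more memory-bound (lower arithmetic intensity) than prefill.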