Commit 5f0df3a (parent: 989cd20)
Alan Liu committed

add prefill memory

Files changed (3):
  1. .streamlit/config.toml +1 -0
  2. app.py +26 -5
  3. calc_util.py +136 -7
.streamlit/config.toml ADDED
@@ -0,0 +1 @@
+[theme]
app.py CHANGED
@@ -57,13 +57,17 @@ subtotal_operations = [
 
 
 
-col1, col2, col3, col4, col5 = st.columns(5)
+col1, col2, col3, col4, col5 = st.columns([1,1.5,2,2,2])
 
 inference_config = {}
 parameter_count = {}
 cached_parameter_count = {}
+
 prefilling_operation_count = {}
 generation_operation_count = {}
+prefilling_memory_count = {}
+generation_memory_count = {}
+
 gpu_config = {}
 inference_info = {}
 
@@ -77,6 +81,7 @@ with col1:
     model_config['intermediate_size'] = st.number_input('intermediate size', value=model_config['intermediate_size'], format ="%d")
     model_config['vocab_size'] = st.number_input('vocab size', value= model_config['vocab_size'], format ="%d")
     model_config['max_position_embeddings'] = st.number_input('max position embeddings', value=model_config['max_position_embeddings'], format ="%d")
+    model_config['hidden_size_per_head'] = model_config['hidden_size']/model_config['num_attention_heads']
 
     header4("Inference Setting")
     inference_config['batchsize'] = st.number_input('batchsize', value=1, format ="%d")
@@ -131,43 +136,57 @@ with col2:
 
 with col3: # Prefilling
     prefilling_operation_count = prefilling_operation(model_config, inference_config)
-    inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*10**12)
+    prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
+    inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
     inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
+    inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))
 
     operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key in subtotal_operations}
+    prefilling_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in prefilling_activation_memory_count.items()}
 
     ## Convert dictionaries to pandas dataframes for table display
     df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPS"])
     df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPS"])
-
+
+    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(prefilling_activation_memory_count)
+    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(prefilling_activation_memory_count)
+
     header4("Inference Ops: Prefilling")
     st.markdown(create_table(df_operation_count))
 
     header5("Summary: Prefilling")
     st.markdown(create_table(df_subtotal_operation_count))
     st.write(f"Prefillng throughput (tokens/s): {inference_info['inference_prefilling_throughput']:.2f}")
+    st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
+    st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
 
     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
 
 
 
-with col4: # Prefilling
+with col4: # Generation
     generation_operation_count = generation_operation(model_config, inference_config)
-    inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*10**12)
+    generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
+    inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
     inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
     inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
+    inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))
 
     operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
     subtotal_operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key in subtotal_operations}
+    generation_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in generation_activation_memory_count.items()}
 
     ## Convert dictionaries to pandas dataframes for table display
     df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPS"])
     df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPS"])
 
+    #df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_count)
+    #df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_count)
+
     header4("Inference Ops: Generation")
     st.markdown(create_table(df_operation_count))
 
@@ -175,6 +194,8 @@ with col4: # Prefilling
     st.markdown(create_table(df_subtotal_operation_count))
     st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
     st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
+    st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
+    #st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
 
     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
calc_util.py CHANGED
@@ -32,23 +32,23 @@ def positional_embedding_operation(model_config, inference_config):
 ### Below three are the same
 def attention_K_operation(model_config, inference_config, seq_length):
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
-    B = [model_config['hidden_size'], model_config['hidden_size']/model_config['num_attention_heads']]
+    B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 def attention_Q_operation(model_config, inference_config, seq_length):
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
-    B = [model_config['hidden_size'], model_config['hidden_size']/model_config['num_attention_heads']]
+    B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 def attention_V_operation(model_config, inference_config, seq_length):
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
-    B = [model_config['hidden_size'], model_config['hidden_size']/model_config['num_attention_heads']]
+    B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 ##
 def attention_QK_operation(model_config, inference_config, seq_length_Q, seq_length_K):
-    A = [inference_config['batchsize'], seq_length_Q, model_config['hidden_size']/model_config['num_attention_heads']]
-    B = [model_config['hidden_size']/model_config['num_attention_heads'], seq_length_K]
+    A = [inference_config['batchsize'], seq_length_Q, model_config['hidden_size_per_head']]
+    B = [model_config['hidden_size_per_head'], seq_length_K]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads']* matrix_operation(A, B)
 
 def attention_softmax_operation(model_config, inference_config,seq_length):
@@ -59,7 +59,7 @@ def attention_softmax_operation(model_config, inference_config,seq_length):
 
 def attention_multV_operation(model_config, inference_config, seq_length_Q, seq_length_V):
     A = [inference_config['batchsize'], seq_length_Q, seq_length_V]
-    B = [seq_length_V, model_config['hidden_size']/model_config['num_attention_heads']]
+    B = [seq_length_V, model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads']* matrix_operation(A, B)
 
 def attention_out_operation(model_config, inference_config, seq_length):
@@ -153,4 +153,133 @@ def generation_operation(model_config, inference_config):
     generation_operation_count['mlp'] = generation_operation_count['mlp1'] + generation_operation_count['mlp2']
     generation_operation_count['total'] = (generation_operation_count['attention'] + generation_operation_count['mlp'] + generation_operation_count['layernorm'])
 
-    return generation_operation_count
+    return generation_operation_count
+
+
+def word_embedding_activation_memory(model_config, inference_config, seq_length):
+    return inference_config['batchsize'] * seq_length * (model_config['vocab_size'] + model_config['hidden_size'])
+
+def positional_embedding_activation_memory(model_config, inference_config, seq_length):
+    return 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size']
+
+def attention_K_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['hidden_size_per_head'])
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def attention_V_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['hidden_size_per_head'])
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def attention_Q_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['hidden_size_per_head'])
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def attention_QK_activation_memory(model_config, inference_config, seq_length_Q, seq_length_K):
+    inputs_Q = inference_config['batchsize'] * seq_length_Q * model_config['hidden_size_per_head']
+    inputs_K = inference_config['batchsize'] * seq_length_K * model_config['hidden_size_per_head']
+    outputs = inference_config['batchsize'] * seq_length_Q * seq_length_K
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * (inputs_Q + inputs_K + outputs)
+
+def attention_softmax_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = (2 * inference_config['batchsize'] * seq_length * seq_length)
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def attention_multV_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = inference_config['batchsize'] * seq_length * seq_length + 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size_per_head']
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def attention_out_activation_memory(model_config, inference_config, seq_length):
+    per_head_per_layer = 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size']
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * per_head_per_layer
+
+def layernorm_activation_memory(model_config, inference_config, seq_length):
+    per_layernorm_per_layer = 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size']
+    return model_config['num_hidden_layers'] * model_config['layernorm_operation'] * per_layernorm_per_layer
+
+def mlp1_activation_memory(model_config, inference_config, seq_length):
+    per_layer = inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['intermediate_size'])
+    return model_config['num_hidden_layers'] * per_layer
+
+def mlp2_activation_memory(model_config, inference_config, seq_length):
+    per_layer = inference_config['batchsize'] * seq_length * (model_config['intermediate_size'] + model_config['hidden_size'])
+    return model_config['num_hidden_layers'] * per_layer
+
+def prefilling_activation_memory(model_config, inference_config):
+    activation_memory = {}
+
+    activation_memory['word_embedding'] = word_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['positional_embedding'] = positional_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+
+    activation_memory['attention_Q'] = attention_Q_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_K'] = attention_K_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_V'] = attention_V_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_QK'] = attention_QK_activation_memory(model_config, inference_config, inference_config['input_seq_length'], inference_config['input_seq_length'])
+    activation_memory['attention_softmax'] = attention_softmax_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['attention_out'] = attention_out_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+
+    activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+
+    activation_memory['mlp1'] = mlp1_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['mlp2'] = mlp2_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+
+    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
+    activation_memory['attention'] = (
+        activation_memory['attention_Q'] + activation_memory['attention_K'] +
+        activation_memory['attention_V'] + activation_memory['attention_QK'] +
+        activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
+        activation_memory['attention_out']
+    )
+    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
+    activation_memory['total'] = (
+        activation_memory['embeddings'] + activation_memory['attention'] +
+        activation_memory['mlp'] + activation_memory['layernorm']
+    )
+
+    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
+    activation_memory['attention'] = sum([v for k,v in activation_memory.items() if 'attention' in k])
+    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
+    activation_memory['total'] = (activation_memory['attention'] + activation_memory['mlp'] + activation_memory['layernorm'])
+
+    return activation_memory
+
+
+def generation_activation_memory(model_config, inference_config):
+    # TODO Check how KV cache affects activation_memory
+    activation_memory = {}
+
+    activation_memory['word_embedding'] = word_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['positional_embedding'] = positional_embedding_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+
+    activation_memory['attention_Q'] = attention_Q_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_K'] = attention_K_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_V'] = attention_V_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_QK'] = attention_QK_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'], inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_softmax'] = attention_softmax_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_multV'] = attention_multV_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['attention_out'] = attention_out_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+
+    activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+
+    activation_memory['mlp1'] = mlp1_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+    activation_memory['mlp2'] = mlp2_activation_memory(model_config, inference_config, inference_config['input_seq_length'] + inference_config['output_seq_length'])
+
+    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
+    activation_memory['attention'] = (
+        activation_memory['attention_Q'] + activation_memory['attention_K'] +
+        activation_memory['attention_V'] + activation_memory['attention_QK'] +
+        activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
+        activation_memory['attention_out']
+    )
+    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
+    activation_memory['total'] = (
+        activation_memory['embeddings'] + activation_memory['attention'] +
+        activation_memory['mlp'] + activation_memory['layernorm']
+    )
+
+    activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
+    activation_memory['attention'] = sum([v for k,v in activation_memory.items() if 'attention' in k])
+    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
+    activation_memory['total'] = (activation_memory['attention'] + activation_memory['mlp'] + activation_memory['layernorm'])
+
+    return activation_memory
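The two new entry points mirror prefilling_operation and generation_operation and can be called directly. A minimal usage sketch with a hypothetical GPT-2-small-like config; note that hidden_size_per_head must be filled in by the caller (app.py derives it), and layernorm_operation is assumed here to mean the number of layernorms per transformer block:

    from calc_util import prefilling_activation_memory, generation_activation_memory

    model_config = {
        'hidden_size': 768,            # hypothetical GPT-2-small-like values
        'num_attention_heads': 12,
        'num_hidden_layers': 12,
        'intermediate_size': 3072,
        'vocab_size': 50257,
        'layernorm_operation': 2,      # assumption: layernorms per block
    }
    model_config['hidden_size_per_head'] = model_config['hidden_size'] / model_config['num_attention_heads']

    inference_config = {'batchsize': 1, 'input_seq_length': 512, 'output_seq_length': 128}

    prefill_mem = prefilling_activation_memory(model_config, inference_config)
    gen_mem = generation_activation_memory(model_config, inference_config)
    print(f"prefill activation total (Byte): {int(prefill_mem['total']):,}")
    print(f"generation activation total (Byte): {int(gen_mem['total']):,}")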