Alan Liu committed
Commit cae5be9
2 Parent(s): 3849813 3732b01

Merge commit '3732b011219890e6d2b6935eba116dea31e10bfd'

Files changed (3):
  1. app.py +43 -25
  2. calc_util.py +138 -37
  3. model_util.py +124 -1
app.py CHANGED
@@ -2,7 +2,7 @@
 
 import streamlit as st
 import pandas as pd
-from model_util import fetch_dictionary_content, load_parameter
+from model_util import fetch_dictionary_content, load_parameter, get_model, classify_module, get_module_tensors
 from calc_util import *
 from render_util import create_table, header4, header5
 
@@ -15,6 +15,9 @@ if 'model_config' not in st.session_state:
 def load_model_config(model_id):
     if 'model_id' in st.session_state['model_config'] and st.session_state['model_config']['model_id'] == model_id:
         return st.session_state['model_config']
+    if 'parameter_count' in st.session_state:
+        st.session_state.pop('parameter_count')
+
     model_config = {}
     dictionary_content = fetch_dictionary_content(model_id)
     if dictionary_content:
@@ -27,7 +30,7 @@ def load_model_config(model_id):
         model_config['max_position_embeddings'] = dictionary_content['max_position_embeddings']
         model_config['layernorm_operation'] = 2
     else:
-        st.warning("Model Info is not public!")
+        st.warning("Fetching information failed! Maybe model info is not public!")
        model_config['model_id'] = 'opt-1.3b'
        model_config['hidden_size'] = 2048
        model_config['num_attention_heads'] = 32
@@ -37,6 +40,14 @@ def load_model_config(model_id):
        model_config['max_position_embeddings'] = 2048
        model_config['layernorm_operation'] = 2
 
+    try:
+        model_config['model'] = get_model(model_id, None, None)
+        module_tensors = get_module_tensors(model_config['model'])
+        model_config['module_classes'] = classify_module(module_tensors)
+    except:
+        model_config['model'] = None
+        model_config['module_classes'] = None
+
     st.session_state['model_config'] = model_config
     return model_config
 
@@ -45,7 +56,6 @@ subtotal_parameters = [
     'embedding_weights',
     'attention_weights',
     'mlp_weights',
-    'model_total_size (Byte)'
 ]
 
 subtotal_operations = [
@@ -57,7 +67,7 @@ subtotal_operations = [
 
 
 
-col1, col2, col3, col4, col5 = st.columns([1,1.5,2.5,2.5,0.1])
+col1, col2, col3, col4, col5 = st.columns([0.8, 2, 2.5, 2.5, 0.01])
 
 inference_config = {}
 parameter_count = {}
@@ -98,27 +108,16 @@ with col1:
     st.write(f"arithmetic_intensity: {gpu_config['arithmetic_intensity']:.3f}")
 
 with col2:
-    parameter_count['word_embedding'] = model_config['vocab_size']*model_config['hidden_size']
-    parameter_count['positional_embedding'] = model_config['max_position_embeddings']*model_config['hidden_size']
-
-    parameter_count['attention_Q'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
-    parameter_count['attention_K'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
-    parameter_count['attention_V'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
-    parameter_count['attention_out'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
-
-    parameter_count['layernorm'] = 2*model_config['layernorm_operation']*model_config['num_hidden_layers']*model_config['hidden_size']
-    parameter_count['mlp1'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
-    parameter_count['mlp2'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
-    parameter_count['embedding_weights'] = parameter_count['word_embedding'] + parameter_count['positional_embedding']
-    parameter_count['attention_weights'] = parameter_count['attention_out'] + parameter_count['attention_Q'] + parameter_count['attention_K'] + parameter_count['attention_V']
-    parameter_count['mlp_weights'] = parameter_count['mlp1'] + parameter_count['mlp2']
-    parameter_count['model_total_size (Byte)'] = inference_config['byte_per_parameter'] * (
-        parameter_count['embedding_weights'] +
-        parameter_count['attention_weights'] +
-        parameter_count['mlp_weights'] +
-        parameter_count['layernorm'])
-
-
+    if 'parameter_count' not in st.session_state:
+        if model_config['model']:
+            st.info("Model info fetcted!")
+            parameter_count = calc_model_size_from_model(model_config, inference_config)
+        else:
+            st.info("Fail to fetch model info. Using estimation!")
+            parameter_count = model_size_estimate(model_config, inference_config)
+        st.session_state.parameter_count = parameter_count
+    else:
+        parameter_count = st.session_state.parameter_count
 
     parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key not in subtotal_parameters}
     subtotal_parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key in subtotal_parameters}
@@ -133,6 +132,25 @@ with col2:
     header4("Parameters Summary")
     st.markdown(create_table(df_subtotal_parameters_items))
 
+    model_total_size_in_byte = inference_config['byte_per_parameter'] * (
+        parameter_count['embedding_weights'] +
+        parameter_count['attention_weights'] +
+        parameter_count['mlp_weights'] +
+        parameter_count['layernorm']
+    )
+    st.write(f'model_total_size (Byte): {model_total_size_in_byte:,}')
+
+
+    # add parameter viewer
+    if model_config['model']:
+        header4("Parameters Viewer")
+        weight_generic = st.selectbox('Select weight:', options=model_config['module_classes'])
+        modules = {}
+        for module in model_config['module_classes'][weight_generic]:
+            modules.update(module)
+        modules = {k: list(v) for k, v in modules.items()}
+        modules = pd.DataFrame(list(modules.items()), columns=["Parameter", "Shape"])
+        st.markdown(create_table(modules))
 
 with col3: # Prefilling
     prefilling_operation_count = prefilling_operation(model_config, inference_config)
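
A minimal sketch (not part of the commit) of the new fetch-or-estimate flow that app.py now uses: load_model_config tries get_model / get_module_tensors / classify_module, and col2 calls calc_model_size_from_model when real tensor shapes are available, falling back to model_size_estimate otherwise. It assumes this repo's model_util and calc_util are importable, network access to the Hub, and illustrative OPT-1.3b-style config values (the vocab, layer count, and FFN size below are assumptions, not taken from the diff).

# Sketch only, not in the commit: exercise the new fetch-or-estimate flow outside Streamlit.
from model_util import get_model, get_module_tensors, classify_module
from calc_util import calc_model_size_from_model, model_size_estimate

model_config = {
    'model_id': 'facebook/opt-1.3b',          # illustrative model id
    'hidden_size': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 24,
    'intermediate_size': 8192, 'vocab_size': 50272,
    'max_position_embeddings': 2048, 'layernorm_operation': 2,
}
inference_config = {'byte_per_parameter': 2}  # fp16 weights

try:
    # Same path app.py now takes: build the model on the `meta` device and classify its tensors.
    model = get_model(model_config['model_id'], None, None)
    model_config['module_classes'] = classify_module(get_module_tensors(model))
    parameter_count = calc_model_size_from_model(model_config, inference_config)
except Exception:
    # Mirrors the fallback branch that shows "Using estimation!" in the UI.
    parameter_count = model_size_estimate(model_config, inference_config)

model_total_size_in_byte = inference_config['byte_per_parameter'] * (
    parameter_count['embedding_weights'] + parameter_count['attention_weights'] +
    parameter_count['mlp_weights'] + parameter_count['layernorm'])
print(f"model_total_size (Byte): {model_total_size_in_byte:,.0f}")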
calc_util.py CHANGED
@@ -1,5 +1,47 @@
 import numpy as np
-
+from collections import defaultdict
+from functools import partial
+from typing import List
+from model_util import get_module_tensors_matched
+
+def calc_model_size_from_model(model_config, inference_config):
+    get_module_tensors_matched_partial = partial(get_module_tensors_matched, module_classes_dict = model_config['module_classes'])
+
+    parameter_count = defaultdict(float)
+    parameter_count['word_embedding'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'embed' in x and 'pos' not in x)])
+    parameter_count['positional_embedding'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'embed' in x and 'pos' in x)])
+
+    parameter_count['attention_Q'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'att' in x and 'q' in x)])
+    parameter_count['attention_K'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'att' in x and 'k' in x)])
+    parameter_count['attention_V'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'att' in x and 'v' in x)])
+    parameter_count['attention_out'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'att' in x and ('out_' in x or 'o_' in x))])
+
+    parameter_count['layernorm'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'norm' in x)])
+    parameter_count['mlp_weights'] = sum([v.numel() for v in get_module_tensors_matched_partial(lambda x: 'fc' in x or 'mlp' in x)])
+
+    parameter_count['embedding_weights'] = parameter_count['word_embedding'] + parameter_count['positional_embedding']
+    parameter_count['attention_weights'] = parameter_count['attention_out'] + parameter_count['attention_Q'] + parameter_count['attention_K'] + parameter_count['attention_V']
+
+    return parameter_count
+
+def model_size_estimate(model_config, inference_config):
+    parameter_count = {}
+    parameter_count['word_embedding'] = model_config['vocab_size']*model_config['hidden_size']
+    parameter_count['positional_embedding'] = model_config['max_position_embeddings']*model_config['hidden_size']
+
+    parameter_count['attention_Q'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
+    parameter_count['attention_K'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
+    parameter_count['attention_V'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
+    parameter_count['attention_out'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['hidden_size']/model_config['num_attention_heads']*model_config['num_attention_heads']
+
+    parameter_count['layernorm'] = 2*model_config['layernorm_operation']*model_config['num_hidden_layers']*model_config['hidden_size']
+    parameter_count['mlp1'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
+    parameter_count['mlp2'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
+    parameter_count['embedding_weights'] = parameter_count['word_embedding'] + parameter_count['positional_embedding']
+    parameter_count['attention_weights'] = parameter_count['attention_out'] + parameter_count['attention_Q'] + parameter_count['attention_K'] + parameter_count['attention_V']
+    parameter_count['mlp_weights'] = parameter_count['mlp1'] + parameter_count['mlp2']
+
+    return parameter_count
 
 def multiplication_in_int64(array):
     return np.cumprod(np.array(array, dtype=np.int64))[-1]
@@ -19,28 +61,76 @@ def word_embedding_operation(model_config, inference_config):
     #The resultant matrix after the multiplication will be of size \( B \times s \times d_{model} \).
     #For each element in this resultant matrix, the number of FLOPs required is \( 2 \times n_{vocab} \). This is because for a single element in the output matrix, we have \( 2N \) FLOPs (with \( N \) being the common dimension), leading to the matrix multiplication FLOP count as:
     #\begin{equation}
-    #2 \times B \times s \times n_{vocab} \times d_{model}
+    #2 \times B \times s \times n_{v ocab} \times d_{model}
     #\end{equation}
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'embed' in x and 'pos' not in x, model_config['module_classes'])
+        A = [inference_config['batchsize'], inference_config['input_seq_length'], modules[0][0]]
+        B = modules[0]
+        op_count = matrix_operation(A, B)
+        return op_count
+
     A = [inference_config['batchsize'], inference_config['input_seq_length'], model_config['vocab_size']]
     B = [model_config['vocab_size'], model_config['hidden_size']]
-    return matrix_operation(A, B)
+    op_count = matrix_operation(A, B)
+    return op_count
 
 
 def positional_embedding_operation(model_config, inference_config):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'embed' in x and 'pos' in x, model_config['module_classes'])
+        return multiplication_in_int64([inference_config['batchsize'], inference_config['input_seq_length'], modules[0][-1]])
+
     return multiplication_in_int64([inference_config['batchsize'], inference_config['input_seq_length'], model_config['hidden_size']])
 
 ### Below three are the same
 def attention_K_operation(model_config, inference_config, seq_length):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'att' in x and 'k' in x , model_config['module_classes'])
+        total = 0
+        for module in modules:
+            if len(module) > 1:
+                A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
+                B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
+                total += model_config['num_attention_heads']*matrix_operation(A, B)
+            else:
+                total += model_config['hidden_size']
+        return total
+
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
     B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 def attention_Q_operation(model_config, inference_config, seq_length):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'att' in x and 'q' in x , model_config['module_classes'])
+        total = 0
+        for module in modules:
+            if len(module) > 1:
+                A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
+                B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
+                total += model_config['num_attention_heads']*matrix_operation(A, B)
+            else:
+                total += model_config['hidden_size']
+        return total
+
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
     B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 def attention_V_operation(model_config, inference_config, seq_length):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'att' in x and 'v' in x , model_config['module_classes'])
+        total = 0
+        for module in modules:
+            if len(module) > 1:
+                A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
+                B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
+                total += model_config['num_attention_heads']*matrix_operation(A, B)
+            else:
+                total += model_config['hidden_size']
+        return total
+
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
     B = [model_config['hidden_size'], model_config['hidden_size_per_head']]
     return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
@@ -49,7 +139,7 @@ def attention_V_operation(model_config, inference_config, seq_length):
 def attention_QK_operation(model_config, inference_config, seq_length_Q, seq_length_K):
     A = [inference_config['batchsize'], seq_length_Q, model_config['hidden_size_per_head']]
     B = [model_config['hidden_size_per_head'], seq_length_K]
-    return model_config['num_hidden_layers'] * model_config['num_attention_heads']* matrix_operation(A, B)
+    return model_config['num_hidden_layers'] * model_config['num_attention_heads'] * matrix_operation(A, B)
 
 def attention_softmax_operation(model_config, inference_config,seq_length):
     # Ref: Ouyang, A. (2023). Understanding the Performance of Transformer Inference (Doctoral dissertation, Massachusetts Institute of Technology).
@@ -63,6 +153,18 @@ def attention_multV_operation(model_config, inference_config, seq_length_Q, seq_
     return model_config['num_hidden_layers'] * model_config['num_attention_heads']* matrix_operation(A, B)
 
 def attention_out_operation(model_config, inference_config, seq_length):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'att' in x and 'k' in x , model_config['module_classes'])
+        total = 0
+        for module in modules:
+            if len(module) > 1:
+                A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
+                B = [model_config['hidden_size'], model_config['hidden_size']]
+                total += matrix_operation(A, B)
+            else:
+                total += model_config['hidden_size']
+        return total
+
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
     B = [model_config['hidden_size'], model_config['hidden_size']]
     return model_config['num_hidden_layers'] * matrix_operation(A, B)
@@ -70,19 +172,34 @@ def attention_out_operation(model_config, inference_config, seq_length):
 def layernorm_operation(model_config, inference_config, seq_length):
     # Ref: Ouyang, A. (2023). Understanding the Performance of Transformer Inference (Doctoral dissertation, Massachusetts Institute of Technology).
     # 5 is a modeled value
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'norm' in x, model_config['module_classes'])
+        total = 0
+        for module in modules:
+            total += model_config['hidden_size']
+        return 5*total
+
     layernorm_operation = (5*inference_config['batchsize']*seq_length*model_config['hidden_size'])
     return model_config['num_hidden_layers'] * model_config['layernorm_operation'] * layernorm_operation
 
 
-def mlp1_operation(model_config, inference_config, seq_length):
+def mlp_operation(model_config, inference_config, seq_length):
+    if model_config['module_classes']:
+        modules = get_module_tensors_matched(lambda x: 'fc' in x or 'mlp' in x, model_config['module_classes'])
+        total = 0
+        for module in modules:
+            if len(module) > 1:
+                A = [inference_config['batchsize'], seq_length, module[1]]
+                B = [module[1], module[0]]
+                total += matrix_operation(A, B)
+            else:
+                total += modules[-1][0]
+        return total
+
     A = [inference_config['batchsize'], seq_length, model_config['hidden_size']]
     B = [model_config['hidden_size'], model_config['intermediate_size']]
-    return model_config['num_hidden_layers'] * matrix_operation(A, B)
+    return model_config['num_hidden_layers'] * (2*matrix_operation(A, B))
 
-def mlp2_operation(model_config, inference_config, seq_length):
-    A = [inference_config['batchsize'], seq_length, model_config['intermediate_size']]
-    B = [model_config['intermediate_size'], model_config['hidden_size']]
-    return model_config['num_hidden_layers'] * matrix_operation(A, B)
 
 def prefilling_operation(model_config, inference_config):
     prefilling_operation_count = {}
@@ -99,12 +216,10 @@ def prefilling_operation(model_config, inference_config):
 
     prefilling_operation_count['layernorm'] =layernorm_operation(model_config, inference_config, inference_config['input_seq_length'])
 
-    prefilling_operation_count['mlp1'] = mlp1_operation(model_config, inference_config, inference_config['input_seq_length'])
-    prefilling_operation_count['mlp2'] = mlp2_operation(model_config, inference_config, inference_config['input_seq_length'])
+    prefilling_operation_count['mlp'] = mlp_operation(model_config, inference_config, inference_config['input_seq_length'])
 
     prefilling_operation_count['embeddings'] = prefilling_operation_count['word_embedding'] + prefilling_operation_count['positional_embedding']
     prefilling_operation_count['attention'] = sum([v for k,v in prefilling_operation_count.items() if 'attention' in k])
-    prefilling_operation_count['mlp'] = prefilling_operation_count['mlp1'] + prefilling_operation_count['mlp2']
     prefilling_operation_count['total'] = (prefilling_operation_count['embeddings'] + prefilling_operation_count['attention'] + prefilling_operation_count['mlp'] + prefilling_operation_count['layernorm'])
 
     return prefilling_operation_count
@@ -120,8 +235,7 @@ def generation_operation(model_config, inference_config):
     generation_operation_count['attention_softmax'] = 0
     generation_operation_count['attention_multV'] = 0
     generation_operation_count['attention_out'] = 0
-    generation_operation_count['mlp1'] = 0
-    generation_operation_count['mlp2'] = 0
+    generation_operation_count['mlp'] = 0
     generation_operation_count['layernorm'] = 0
 
     for t in range(inference_config['output_seq_length']):
@@ -133,8 +247,7 @@ def generation_operation(model_config, inference_config):
             generation_operation_count['attention_softmax'] += attention_softmax_operation(model_config, inference_config, 1)
             generation_operation_count['attention_multV'] += attention_multV_operation(model_config, inference_config, seq_length_Q=1, seq_length_V=(t+1)+inference_config['input_seq_length'])
             generation_operation_count['attention_out'] += attention_out_operation(model_config, inference_config, 1)
-            generation_operation_count['mlp1'] += mlp1_operation(model_config, inference_config, 1)
-            generation_operation_count['mlp2'] += mlp2_operation(model_config, inference_config, 1)
+            generation_operation_count['mlp'] += mlp_operation(model_config, inference_config, 1)
         else:
             generation_operation_count['attention_K'] += attention_K_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
             generation_operation_count['attention_V'] += attention_V_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
@@ -143,14 +256,12 @@ def generation_operation(model_config, inference_config):
             generation_operation_count['attention_softmax'] += attention_softmax_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
             generation_operation_count['attention_multV'] += attention_multV_operation(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_V=(t+1)+inference_config['input_seq_length'])
             generation_operation_count['attention_out'] += attention_out_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
-            generation_operation_count['mlp1'] += mlp1_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
-            generation_operation_count['mlp2'] += mlp2_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            generation_operation_count['mlp'] += mlp_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
 
         generation_operation_count['layernorm'] += layernorm_operation(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
 
     generation_operation_count['embeddings'] = generation_operation_count['word_embedding'] + generation_operation_count['positional_embedding']
     generation_operation_count['attention'] = sum([v for k,v in generation_operation_count.items() if 'attention' in k])
-    generation_operation_count['mlp'] = generation_operation_count['mlp1'] + generation_operation_count['mlp2']
     generation_operation_count['total'] = (generation_operation_count['attention'] + generation_operation_count['mlp'] + generation_operation_count['layernorm'])
 
     return generation_operation_count
@@ -196,12 +307,9 @@ def layernorm_activation_memory(model_config, inference_config, seq_length):
     per_layernorm_per_layer = 2 * inference_config['batchsize'] * seq_length * model_config['hidden_size']
     return model_config['num_hidden_layers'] * model_config['layernorm_operation'] * per_layernorm_per_layer
 
-def mlp1_activation_memory(model_config, inference_config, seq_length):
-    per_layer = inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['intermediate_size'])
-    return model_config['num_hidden_layers'] * per_layer
-
-def mlp2_activation_memory(model_config, inference_config, seq_length):
-    per_layer = inference_config['batchsize'] * seq_length * (model_config['intermediate_size'] + model_config['hidden_size'])
+def mlp_activation_memory(model_config, inference_config, seq_length):
+    # two mlp layer
+    per_layer = 2 * inference_config['batchsize'] * seq_length * (model_config['hidden_size'] + model_config['intermediate_size'])
     return model_config['num_hidden_layers'] * per_layer
 
 def prefilling_activation_memory(model_config, inference_config):
@@ -220,8 +328,7 @@ def prefilling_activation_memory(model_config, inference_config):
 
     activation_memory['layernorm'] = layernorm_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
 
-    activation_memory['mlp1'] = mlp1_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
-    activation_memory['mlp2'] = mlp2_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
+    activation_memory['mlp'] = mlp_activation_memory(model_config, inference_config, inference_config['input_seq_length'])
 
     activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
     activation_memory['attention'] = (
@@ -230,7 +337,6 @@ def prefilling_activation_memory(model_config, inference_config):
         activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
         activation_memory['attention_out']
     )
-    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
     activation_memory['total'] = (
         activation_memory['embeddings'] + activation_memory['attention'] +
         activation_memory['mlp'] + activation_memory['layernorm']
@@ -238,7 +344,6 @@ def prefilling_activation_memory(model_config, inference_config):
 
     activation_memory['embeddings'] = activation_memory['word_embedding'] + activation_memory['positional_embedding']
     activation_memory['attention'] = sum([v for k,v in activation_memory.items() if 'attention' in k])
-    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
     activation_memory['total'] = (activation_memory['attention'] + activation_memory['mlp'] + activation_memory['layernorm'])
 
     return activation_memory
@@ -255,8 +360,7 @@ def generation_activation_memory(model_config, inference_config):
     activation_memory['attention_softmax'] = 0
     activation_memory['attention_multV'] = 0
     activation_memory['attention_out'] = 0
-    activation_memory['mlp1'] = 0
-    activation_memory['mlp2'] = 0
+    activation_memory['mlp'] = 0
     activation_memory['layernorm'] = 0
 
     for t in range(inference_config['output_seq_length']):
@@ -268,8 +372,7 @@ def generation_activation_memory(model_config, inference_config):
             activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, 1)
            activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=1, seq_length_V=(t+1)+inference_config['input_seq_length'])
             activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, 1)
-            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, 1)
-            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, 1)
+            activation_memory['mlp'] += mlp_activation_memory(model_config, inference_config, 1)
         else:
             activation_memory['attention_K'] += attention_K_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
             activation_memory['attention_V'] += attention_V_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
@@ -278,8 +381,7 @@ def generation_activation_memory(model_config, inference_config):
             activation_memory['attention_softmax'] += attention_softmax_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
             activation_memory['attention_multV'] += attention_multV_activation_memory(model_config, inference_config, seq_length_Q=(t+1)+inference_config['input_seq_length'], seq_length_V=(t+1)+inference_config['input_seq_length'])
             activation_memory['attention_out'] += attention_out_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
-            activation_memory['mlp1'] += mlp1_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
-            activation_memory['mlp2'] += mlp2_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
+            activation_memory['mlp'] += mlp_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
 
         activation_memory['layernorm'] += layernorm_activation_memory(model_config, inference_config, (t+1)+inference_config['input_seq_length'])
 
@@ -290,7 +392,6 @@ def generation_activation_memory(model_config, inference_config):
         activation_memory['attention_softmax'] + activation_memory['attention_multV'] +
         activation_memory['attention_out']
     )
-    activation_memory['mlp'] = activation_memory['mlp1'] + activation_memory['mlp2']
     activation_memory['total'] = (
         activation_memory['embeddings'] + activation_memory['attention'] +
         activation_memory['mlp'] + activation_memory['layernorm']
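
The comments carried through word_embedding_operation price a matrix multiply at 2 FLOPs per output element times the shared dimension, i.e. 2 x B x s x n_vocab x d_model for the embedding lookup treated as a dense multiply. A quick standalone check of that arithmetic follows; the helper below is written just for this note (the repo's matrix_operation is defined elsewhere in calc_util.py and is not shown in this diff), and the sizes are illustrative.

# Standalone check of the FLOP count quoted in word_embedding_operation's comments.
# matmul_flops is a local stand-in written for this note, not the repo's matrix_operation.
import numpy as np

def matmul_flops(A_shape, B_shape):
    # 2 FLOPs per output element; the shared (inner) dimension is already the last
    # entry of A_shape, so the count is 2 * prod(A_shape) * B_shape[-1].
    return 2 * int(np.prod(A_shape, dtype=np.int64)) * int(B_shape[-1])

batchsize, input_seq_length, vocab_size, hidden_size = 1, 1024, 50272, 2048  # illustrative
A = [batchsize, input_seq_length, vocab_size]
B = [vocab_size, hidden_size]
assert matmul_flops(A, B) == 2 * batchsize * input_seq_length * vocab_size * hidden_size
print(f"{matmul_flops(A, B):,}")  # roughly 2.1e11 FLOPs for these sizes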
model_util.py CHANGED
@@ -1,4 +1,12 @@
 import requests
+import re
+from collections import defaultdict
+# Utilities related to loading in and working with models/specific models
+from urllib.parse import urlparse
+import torch
+from accelerate.commands.estimate import check_has_model, create_empty_model
+from accelerate.utils import compute_module_sizes, named_module_tensors
+from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
 
 
 def fetch_dictionary_content(model_id):
@@ -15,4 +23,119 @@ def load_parameter(model_dict, cand_keys):
     for k in cand_keys:
         if k in model_dict:
             return model_dict[k]
-    return 0
+    return 0
+
+# Reference: https://huggingface.co/spaces/hf-accelerate/model-memory-usage
+def extract_from_url(name: str):
+    "Checks if `name` is a URL, and if so converts it to a model name"
+    is_url = False
+    try:
+        result = urlparse(name)
+        is_url = all([result.scheme, result.netloc])
+    except Exception:
+        is_url = False
+    # Pass through if not a URL
+    if not is_url:
+        return name
+    else:
+        path = result.path
+        return path[1:]
+
+
+def translate_llama2(text):
+    "Translates llama-2 to its hf counterpart"
+    if not text.endswith("-hf"):
+        return text + "-hf"
+    return text
+
+
+def get_model(model_name: str, library: str, access_token: str):
+    "Finds and grabs model from the Hub, and initializes on `meta`"
+    if "meta-llama" in model_name:
+        model_name = translate_llama2(model_name)
+    if library == "auto":
+        library = None
+    model_name = extract_from_url(model_name)
+    try:
+        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
+    except GatedRepoError:
+        raise RuntimeError(
+            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. "
+        )
+    except RepositoryNotFoundError:
+        raise RuntimeError(f"Model `{model_name}` was not found on the Hub, please try another model name.")
+    except ValueError:
+        raise RuntimeError(
+            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)"
+        )
+    except (RuntimeError, OSError) as e:
+        library = check_has_model(e)
+        if library != "unknown":
+            raise RuntimeError(
+                f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo."
+            )
+        raise RuntimeError(
+            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
+        )
+    except ImportError:
+        # hacky way to check if it works with `trust_remote_code=False`
+        model = create_empty_model(
+            model_name, library_name=library, trust_remote_code=False, access_token=access_token
+        )
+    except Exception as e:
+        raise RuntimeError(
+            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
+        )
+    return model
+
+def get_module_tensors(model):
+    module_tensors = {}
+    for name, tensor in named_module_tensors(model, recurse=True):
+        module_tensors[name] = tensor.shape
+
+    return module_tensors
+
+
+def classify_module(module_tensors):
+    # A dictionary to store counts for each generic layer type
+    module_classes = defaultdict(list)
+
+    # This function removes all numbers from a given string
+    def remove_numbers(s):
+        return re.sub(r'\d+', '', s)
+
+    # Loop through all named parameters of the model
+    for name in module_tensors:
+        # Remove numbers from the name
+        generic_name = remove_numbers(name)
+        generic_name = generic_name.replace('..', '.')
+
+        # If the name already exists in the dictionary, increase the count, else set it to 1
+        module_classes[generic_name].append({name: module_tensors[name]})
+
+    return module_classes
+
+def get_module_tensors_matched(filter_fn, module_classes_dict):
+    matched = []
+    for generic, module_list in module_classes_dict.items():
+        if filter_fn(generic.lower()):
+            matched.extend([v for module in module_list for v in module.values()])
+
+    return matched
+
+
+if __name__ == '__main__':
+    model = get_model('NousResearch/Nous-Hermes-Llama2-13b', None, None)
+    module_tensors = get_module_tensors(model)
+    module_classes = classify_module(module_tensors)
+    sizes = compute_module_sizes(model, dtype=torch.int8)
+    size_dict = {
+        'attn':0,
+        'mlp':0,
+        'embed':0,
+    }
+    for k, v in sizes.items():
+        for kk in size_dict:
+            if kk in k and 'weight' in k:
+                size_dict[kk] += v/1024**3
+    print(sizes)
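
For a sense of what the new classify_module and get_module_tensors_matched helpers produce, here is a toy run on hand-written, OPT-style parameter names. The names and shapes are made up for illustration; real use feeds the torch.Size values returned by get_module_tensors(model), and importing model_util assumes torch and accelerate are installed.

# Toy illustration, not part of the commit: classify_module strips layer indices so
# per-layer tensors collapse onto one generic key, and get_module_tensors_matched
# then filters those groups by substring. Names/shapes below are invented examples.
from model_util import classify_module, get_module_tensors_matched

module_tensors = {
    'model.decoder.embed_tokens.weight': (50272, 2048),
    'model.decoder.layers.0.self_attn.k_proj.weight': (2048, 2048),
    'model.decoder.layers.1.self_attn.k_proj.weight': (2048, 2048),
    'model.decoder.layers.0.fc1.weight': (8192, 2048),
    'model.decoder.layers.1.fc1.weight': (8192, 2048),
}
module_classes = classify_module(module_tensors)
# Both k_proj weights now live under the generic key 'model.decoder.layers.self_attn.k_proj.weight'.
k_shapes = get_module_tensors_matched(lambda x: 'att' in x and 'k' in x, module_classes)
print(k_shapes)  # [(2048, 2048), (2048, 2048)]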