henryholloway committed
Commit c4f69f6 · 1 Parent(s): 475bc5f

Updated calculations, sources cited

Files changed (1): app.py +13 -29
app.py CHANGED
@@ -26,7 +26,7 @@ precision_options = {
     'mixed': 6,
     'half': 2
 }
-
+# Taken from "Reducing Activation Recomputation in Large Transformer Models" https://arxiv.org/abs/2205.05198
 def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     # Convert bit size to byte size
     byte_size = quantization_bit_sizes[data_type] / 8
@@ -37,11 +37,8 @@ def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     # Memory usage for context (activations)
     activations = calculate_activations(parameter_count, context_length, batch_size, vocab_size, precision)
 
-    # Outputs memory usage
-    outputs = 4 * batch_size * context_length * vocab_size
-
     # Total memory usage
-    total_memory_usage = memory_params + activations + outputs
+    total_memory_usage = memory_params + activations
 
     # Convert bytes to gigabytes
     total_memory_usage_gb = total_memory_usage / (1024 ** 3)
@@ -49,33 +46,20 @@ def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     return total_memory_usage_gb
 
 def calculate_activations(parameter_count, context_length, batch_size, vocab_size, precision):
-    # Simplified activation calculation
-    hidden_size = parameter_count ** 0.5 # assuming a square root relationship for hidden size
-    num_attention_heads = 16 # a typical number of attention heads
-    intermediate_size = hidden_size * 4 # common in transformers
-
-    bytes_per_param = precision_options[precision] / 8
+    # Constants from the paper
+    layers = 32 # assuming 32 layers for the model
+    attention_heads = 32 # assuming 32 attention heads
+    hidden_dimensions = int(parameter_count ** 0.5) # assuming square root relationship for hidden size
 
-    attention_input = bytes_per_param * batch_size * context_length * hidden_size
-    q = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    k = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    softmax_output = bytes_per_param * batch_size * num_attention_heads * (context_length ** 2)
-    v = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    out_proj_input = bytes_per_param * batch_size * context_length * hidden_size
-    attention_block = attention_input + q + k + softmax_output + v + out_proj_input
+    # Calculate activations based on the formula from the paper
+    activations_per_layer = context_length * batch_size * hidden_dimensions * (34 + ((5 * attention_heads * context_length) / hidden_dimensions))
+    activations = layers * activations_per_layer / 2 # divided by 2 as per the paper's calculation at 16bit precision
 
-    mlp_input = bytes_per_param * batch_size * context_length * hidden_size
-    activation_input = bytes_per_param * batch_size * context_length * intermediate_size
-    down_proj_input = bytes_per_param * batch_size * context_length * intermediate_size
-    mlp_block = mlp_input + activation_input + down_proj_input
-
-    layer_norms = bytes_per_param * batch_size * context_length * hidden_size * 2
-
-    layer = attention_block + mlp_block + layer_norms
-
-    activations = layer # assuming 12 layers for simplicity
+    # Convert activations to bytes based on the precision
+    bytes_per_param = precision_options[precision] / 8
+    total_activations = bytes_per_param * activations
 
-    return activations
+    return total_activations
 
 # Streamlit app
 st.title("Memory Usage Calculator for Large Language Models")
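
For reference, the activation term now follows the per-layer estimate from "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198): roughly s*b*h*(34 + 5*a*s/h) bytes per layer at 16-bit precision, with no activation recomputation and no parallelism. A minimal sketch of that formula, using a hypothetical model shape (32 layers, 32 heads, hidden size 4096, context length 4096, batch size 1; these numbers are illustrative and are not read from app.py):

# Per-layer activation estimate from the cited paper: s*b*h*(34 + 5*a*s/h) bytes,
# assuming 16-bit activations, no recomputation, no tensor/sequence parallelism.
# All shape values below are hypothetical examples, not inputs from app.py.
seq_len = 4096       # s: context length
batch = 1            # b: micro-batch size
hidden = 4096        # h: hidden dimension
heads = 32           # a: attention heads
num_layers = 32      # transformer layers

per_layer_bytes = seq_len * batch * hidden * (34 + (5 * heads * seq_len) / hidden)
total_activation_bytes = num_layers * per_layer_bytes
print(f"{total_activation_bytes / 1024 ** 3:.1f} GiB")  # 97.0 GiB for this shape

In the committed code the hidden dimension is instead estimated as sqrt(parameter_count), and the 16-bit result is rescaled for the selected precision via precision_options.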
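
A rough end-to-end check of what calculate_memory_usage now returns (weight memory plus activation memory, converted to GiB). Every number here is an assumption chosen for illustration (a 7B-parameter model stored at 2 bytes per weight and the same hypothetical 32-layer shape), not the app's actual inputs or its quantization_bit_sizes table:

# Hedged sketch of the new total: parameter memory + activation memory, in GiB.
# Assumes 2-byte (16-bit) weights and a hypothetical 7B / 32-layer model shape;
# none of these values come from app.py itself.
params = 7e9                 # hypothetical parameter count
weight_bytes = params * 2    # assumed 2 bytes per weight

seq_len, batch, hidden, heads, num_layers = 4096, 1, 4096, 32, 32
activation_bytes = num_layers * seq_len * batch * hidden * (
    34 + (5 * heads * seq_len) / hidden
)

total_gib = (weight_bytes + activation_bytes) / 1024 ** 3
print(f"{total_gib:.1f} GiB")  # about 13 GiB of weights plus 97 GiB of activations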