henryholloway committed
Commit c4f69f6 · 1 Parent(s): 475bc5f

Updated calculations, sources cited

Files changed (1): app.py +13 -29
app.py CHANGED
@@ -26,7 +26,7 @@ precision_options = {
     'mixed': 6,
     'half': 2
 }
-
+# Taken from "Reducing Activation Recomputation in Large Transformer Models" https://arxiv.org/abs/2205.05198
 def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     # Convert bit size to byte size
     byte_size = quantization_bit_sizes[data_type] / 8
@@ -37,11 +37,8 @@ def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     # Memory usage for context (activations)
     activations = calculate_activations(parameter_count, context_length, batch_size, vocab_size, precision)
 
-    # Outputs memory usage
-    outputs = 4 * batch_size * context_length * vocab_size
-
     # Total memory usage
-    total_memory_usage = memory_params + activations + outputs
+    total_memory_usage = memory_params + activations
 
     # Convert bytes to gigabytes
     total_memory_usage_gb = total_memory_usage / (1024 ** 3)
@@ -49,33 +46,20 @@ def calculate_memory_usage(parameter_count, context_length, data_type, batch_size, vocab_size, precision):
     return total_memory_usage_gb
 
 def calculate_activations(parameter_count, context_length, batch_size, vocab_size, precision):
-    # Simplified activation calculation
-    hidden_size = parameter_count ** 0.5 # assuming a square root relationship for hidden size
-    num_attention_heads = 16 # a typical number of attention heads
-    intermediate_size = hidden_size * 4 # common in transformers
-
-    bytes_per_param = precision_options[precision] / 8
+    # Constants from the paper
+    layers = 32 # assuming 32 layers for the model
+    attention_heads = 32 # assuming 32 attention heads
+    hidden_dimensions = int(parameter_count ** 0.5) # assuming square root relationship for hidden size
 
-    attention_input = bytes_per_param * batch_size * context_length * hidden_size
-    q = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    k = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    softmax_output = bytes_per_param * batch_size * num_attention_heads * (context_length ** 2)
-    v = bytes_per_param * batch_size * context_length * (hidden_size / num_attention_heads) * num_attention_heads
-    out_proj_input = bytes_per_param * batch_size * context_length * hidden_size
-    attention_block = attention_input + q + k + softmax_output + v + out_proj_input
+    # Calculate activations based on the formula from the paper
+    activations_per_layer = context_length * batch_size * hidden_dimensions * (34 + ((5 * attention_heads * context_length) / hidden_dimensions))
+    activations = layers * activations_per_layer / 2 # divided by 2 as per the paper's calculation at 16bit precision
 
-    mlp_input = bytes_per_param * batch_size * context_length * hidden_size
-    activation_input = bytes_per_param * batch_size * context_length * intermediate_size
-    down_proj_input = bytes_per_param * batch_size * context_length * intermediate_size
-    mlp_block = mlp_input + activation_input + down_proj_input
-
-    layer_norms = bytes_per_param * batch_size * context_length * hidden_size * 2
-
-    layer = attention_block + mlp_block + layer_norms
-
-    activations = layer # assuming 12 layers for simplicity
+    # Convert activations to bytes based on the precision
+    bytes_per_param = precision_options[precision] / 8
+    total_activations = bytes_per_param * activations
 
-    return activations
+    return total_activations
 
 # Streamlit app
 st.title("Memory Usage Calculator for Large Language Models")
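
For reference, the activation term now follows the per-layer estimate from "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198): roughly s*b*h*(34 + 5*a*s/h) bytes per layer at 16-bit precision, with no activation recomputation and no parallelism. A minimal sketch of that formula, using a hypothetical model shape (32 layers, 32 heads, hidden size 4096, context length 4096, batch size 1; these numbers are illustrative and are not read from app.py):

# Per-layer activation estimate from the cited paper: s*b*h*(34 + 5*a*s/h) bytes,
# assuming 16-bit activations, no recomputation, no tensor/sequence parallelism.
# All shape values below are hypothetical examples, not inputs from app.py.
seq_len = 4096       # s: context length
batch = 1            # b: micro-batch size
hidden = 4096        # h: hidden dimension
heads = 32           # a: attention heads
num_layers = 32      # transformer layers

per_layer_bytes = seq_len * batch * hidden * (34 + (5 * heads * seq_len) / hidden)
total_activation_bytes = num_layers * per_layer_bytes
print(f"{total_activation_bytes / 1024 ** 3:.1f} GiB")  # 97.0 GiB for this shape

In the committed code the hidden dimension is instead estimated as sqrt(parameter_count), and the 16-bit result is rescaled for the selected precision via precision_options.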
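
A rough end-to-end check of what calculate_memory_usage now returns (weight memory plus activation memory, converted to GiB). Every number here is an assumption chosen for illustration (a 7B-parameter model stored at 2 bytes per weight and the same hypothetical 32-layer shape), not the app's actual inputs or its quantization_bit_sizes table:

# Hedged sketch of the new total: parameter memory + activation memory, in GiB.
# Assumes 2-byte (16-bit) weights and a hypothetical 7B / 32-layer model shape;
# none of these values come from app.py itself.
params = 7e9                 # hypothetical parameter count
weight_bytes = params * 2    # assumed 2 bytes per weight

seq_len, batch, hidden, heads, num_layers = 4096, 1, 4096, 32, 32
activation_bytes = num_layers * seq_len * batch * hidden * (
    34 + (5 * heads * seq_len) / hidden
)

total_gib = (weight_bytes + activation_bytes) / 1024 ** 3
print(f"{total_gib:.1f} GiB")  # about 13 GiB of weights plus 97 GiB of activations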