import math

import gradio as gr
from transformers import AutoConfig  # Required for Hugging Face integration

from calc_params import calc_params  # Import calc_params from the new file

# ---- Helper Functions ---- #
def get_hf_model_args(hf_model_name_or_path):
    try:
        config = AutoConfig.from_pretrained(hf_model_name_or_path, trust_remote_code=True).to_dict()
    except Exception as e:
        raise gr.Error(f"Error fetching Hugging Face model: {str(e)}")

    # Extract relevant values from the config
    num_layers = config.get("num_hidden_layers", None)
    hidden_size = config.get("hidden_size", None)
    num_attention_heads = config.get("num_attention_heads", None)
    vocab_size = config.get("vocab_size", None)
    sequence_length = config.get("max_position_embeddings", None)

    return {
        "num_layers": num_layers,
        "hidden_size": hidden_size,
        "num_attention_heads": num_attention_heads,
        "vocab_size": vocab_size,
        "sequence_length": sequence_length,
    }
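
# Illustrative usage (the model name is just an example Hub repo):
#   args = get_hf_model_args("EleutherAI/pythia-1b")
#   args["hidden_size"]  # hidden dimension from config.json, or None if the field is absent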

# ---- Update Gradio inputs with Hugging Face model config ---- #
def update_from_hf_model(hf_model_name_or_path):
    model_params = get_hf_model_args(hf_model_name_or_path)
    # One gr.update per wired output component, in order:
    # num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length.
    return (gr.update(value=model_params["num_layers"]),
            gr.update(value=model_params["hidden_size"]),
            gr.update(value=model_params["num_attention_heads"]),
            gr.update(value=model_params["vocab_size"]),
            gr.update(value=model_params["sequence_length"]))

# ---- Memory Calculation ---- #
def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib):
    # If a Hub model is given, its config values override the manual inputs
    model_params = get_hf_model_args(hf_model_name_or_path) if hf_model_name_or_path else None

    if model_params:
        num_layers = model_params["num_layers"] or num_layers
        hidden_size = model_params["hidden_size"] or hidden_size
        num_attention_heads = model_params["num_attention_heads"] or num_attention_heads
        vocab_size = model_params["vocab_size"] or vocab_size
        sequence_length = model_params["sequence_length"] or sequence_length

    dp_degree = num_gpus / (tensor_parallel_size * pipeline_parallel_size)  # currently unused in the estimate below

    # Parameter counts for each component of the model
    embed_params = 2 * vocab_size * hidden_size
    positional_params = hidden_size * sequence_length
    ln_params = 8 * hidden_size * num_layers + (2 * hidden_size)
    attention_params = int(2 * (1 + ffn_expansion_factor) * num_layers * hidden_size * hidden_size)
    mlp_params = ffn_expansion_factor * num_layers * hidden_size * hidden_size
    total_params = embed_params + positional_params + ln_params + attention_params + mlp_params

    # Weight memory only: 2 bytes/param in fp16/bf16, 4 bytes/param in fp32
    bytes_per_param = 2 if is_mixed_precision else 4
    model_mem = total_params * bytes_per_param
    per_gpu_mem_gib = (model_mem / (tensor_parallel_size * pipeline_parallel_size)) / 1024**3 + misc_mem_gib

    return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"

# ---- FLOP Calculation ---- #
def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
    # An A_(m x k) x B_(k x n) matrix multiplication requires 2m x k x n FLOPs
    # (the factor of 2 accounts for multiplies and adds)

    # Convert from gigatokens to tokens
    tokens = 1e9 * tokens

    # Determine the FLOPs factor: forward pass = 1x, backward pass = 2x, and
    # activation checkpointing adds one extra forward recomputation
    iter_factor = 3
    if checkpoint_activations:
        iter_factor += 1
    if infer:
        iter_factor = 1

    qkv_flops = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
    attention_matrix_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    attention_over_values_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    linear_projection_flops = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
    ffn_flops = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
    embedding_flops = 6 * tokens * hidden_size * vocab_size

    # MoE models route each token through multiple experts when topk > 1
    if moe and topk > 1:
        ffn_flops += ffn_flops * topk / expert_interval
    if moe:
        gating_flops = 2 * num_experts * hidden_size / expert_interval

    total_flops = qkv_flops + attention_matrix_flops + attention_over_values_flops + linear_projection_flops + ffn_flops + embedding_flops
    if moe:
        total_flops += gating_flops

    def convert_flops(params):
        if params == 0:
            return "0 FLOPs"
        size_name = ("FLOPs", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
        i = int(math.floor(math.log(params, 1000)))
        p = math.pow(1000, i)
        s = round(params / p, 2)
        return f"{s} {size_name[i]}"

    return {
        'qkv_flops': convert_flops(qkv_flops),
        'attention_matrix_flops': convert_flops(attention_matrix_flops),
        'attention_over_values_flops': convert_flops(attention_over_values_flops),
        'linear_projection_flops': convert_flops(linear_projection_flops),
        'ffn_flops': convert_flops(ffn_flops),
        'embedding_flops': convert_flops(embedding_flops),
        'total_flops': convert_flops(total_flops)
    }
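
# Sanity checks (illustrative): convert_flops(1.5e21) returns "1.5 ZFLOPs", and
# for dense training the reported total should land near the familiar 6*N*D
# approximation (N = parameter count, D = training tokens).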

# ---- Gradio Interface ---- #
with gr.Blocks(theme='ysharma/TransformerCalculatorNew') as demo:
    with gr.Accordion("Credits and General Idea", open=False):
        gr.Markdown("""
        This app is a re-creation of [this calculator](https://github.com/EleutherAI/cookbook/tree/main/calc) from EleutherAI.
        Before training or inference even begins, common practical questions about potential models must be answered, such as:
        1. How many parameters are we targeting? How should those parameters be allocated within the model?
        1. How many FLOPs does the model from step 1 take to train on t tokens? How about inference?
        1. How much memory does the model from step 1 take to train/infer on d devices? What memory-saving strategies (e.g. parallelism, quantization, etc.) are necessary to fit the model on device memory?
        """)
    with gr.Tab("Memory Calculation"):
        gr.Markdown("""
        ## Memory Calculation

        Memory Calculation calculates the amount of device memory required to train or infer a model. See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how memory overhead is calculated.
        Take this estimation with a grain of salt, because every implementation is different and these calculations were written to match the GPT-NeoX library as closely as possible.
        Even for other training and inference libraries, however, we expect our script to give approximate memory estimations within acceptable error.
        (Please see [LLM finetuning memory requirements](https://blog.scottlogic.com/2023/11/24/llm-mem.html) for a treatment of how specific memory costs may vary framework-to-framework.) Other good resources that we consulted are the [ZeRO Paper](https://arxiv.org/abs/1910.02054) and [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198.pdf).
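        For example, in fp16/bf16 a 7-billion-parameter model needs roughly 7e9 parameters × 2 bytes ≈ 13 GiB for the weights alone, before any optimizer states, gradients, or activations.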
""") | |
with gr.Accordion("How to use it?", open=False): | |
gr.Markdown(""" | |
## To Use | |
Fill in the required details below and click 'Calculate Memory' to get a result. | |
""") | |
        with gr.Row():
            with gr.Column("Generatable"):
                gr.Markdown("## Generatable")
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    num_attention_heads = gr.Number(
                        label="Number of Attention Heads",
                        value=64,
                        info="Number of attention heads used in the model"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
with gr.Column("User Defined"): | |
gr.Markdown("## User Defined") | |
num_gpus = gr.Number( | |
label="Number of GPUs", | |
value=1, | |
info="Number of GPUs used for training" | |
) | |
tensor_parallel_size = gr.Number( | |
label="Tensor Parallel Size", | |
value=1, | |
info="Tensor parallel degree (1 if not used)" | |
) | |
pipeline_parallel_size = gr.Number( | |
label="Pipeline Parallel Size", | |
value=1, | |
info="Pipeline parallel degree (1 if not used)" | |
) | |
batch_size_per_gpu = gr.Number( | |
label="Batch Size per GPU", | |
value=8, | |
info="Batch size per GPU" | |
) | |
ffn_expansion_factor = gr.Number( | |
label="FFN Expansion Factor", | |
value=4, | |
info="How much the MLP hidden size expands" | |
) | |
is_mixed_precision = gr.Checkbox( | |
label="Mixed Precision", | |
value=True, | |
info="Whether mixed precision is enabled" | |
) | |
misc_mem_gib = gr.Number( | |
label="Miscellaneous Memory Overhead (GiB)", | |
value=5, | |
info="Miscellaneous memory overhead per GPU by DL frameworks, communication libraries, etc." | |
) | |

        calc_memory_button = gr.Button("Calculate Memory")
        memory_result = gr.Textbox(label="Memory Calculation Result", interactive=False)

        calc_memory_button.click(
            calc_mem,
            inputs=[
                hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib
            ],
            outputs=memory_result
        )

        # Auto-fill the "Generatable" fields whenever a Hub model is entered;
        # the five outputs match the five gr.update values returned above
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length]
        )

    # Parameter Calculation Tab
    with gr.TabItem("Parameter Calculation"):
        gr.Markdown("""
        ## Parameter Calculation

        Parameter Calculation calculates the number of parameters present in a given model based on its hyperparameters.
        Such calculations are important to determine memory overheads, FLOPs, or to determine the size of an unknown transformer model.
        We also found the following resources helpful:
        [How does GPT-3 spend its 175B parameters?](https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters)
        and [LLM Parameter Counting](https://kipp.ly/transformer-param-count/).
        Note that the Hugging Face Hub's file explorer already shows a parameter count for `.safetensors` checkpoints.

        ## How To Use

        Simply input the model details, such as the hidden size, number of layers, and attention heads, and press 'Calculate Parameters' to get a result.
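        As a quick sense check, an untied input + output embedding alone contributes 2 × vocab_size × hidden_size parameters; with the defaults below that is 2 × 51200 × 6144 ≈ 629M.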
""") | |
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
with gr.Column("User Defined"): | |
tied_embeddings = gr.Checkbox( | |
label="Tied Embeddings", | |
value=False, | |
info="Whether embeddings are tied (shared between input and output)" | |
) | |
ffn_expansion_factor = gr.Number( | |
label="FFN Expansion Factor", | |
value=4, | |
info="How much the MLP hidden size expands" | |
) | |
num_mlp_linears = gr.Number( | |
label="Number of Linear Layers per MLP Block", | |
value=2, | |
info="How many linear layers per MLP block" | |
) | |
kv_size_ratio = gr.Number( | |
label="KV Size Ratio", | |
value=1.0, | |
info="Ratio of total query heads to key/value heads. 1.0 for MHA, 1/num_attention_heads for MQA" | |
) | |
with gr.Accordion("MoE Parameters", open=False): | |
moe = gr.Checkbox( | |
label="MoE", | |
value=False, | |
info="Whether the model is MoE" | |
) | |
num_experts = gr.Number( | |
label="Number of Experts", | |
value=8, | |
info="Number of experts for MoE" | |
) | |
expert_interval = gr.Number( | |
label="Expert Interval", | |
value=1, | |
info="Expert interval for MoE" | |
) | |
topk = gr.Number( | |
label="Top k Routing", | |
value=1, | |
info="Top k routing for MoE" | |
) | |

        calc_param_button = gr.Button("Calculate Parameters")
        param_result = gr.Textbox(label="Parameter Calculation Result", interactive=False)

        calc_param_button.click(
            calc_params,
            inputs=[vocab_size, tied_embeddings, hidden_size, sequence_length, num_layers, moe, num_experts, expert_interval, topk, ffn_expansion_factor, num_mlp_linears, kv_size_ratio],
            outputs=param_result
        )

        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length]
        )

    # FLOP Calculation Tab
    with gr.TabItem("FLOP Calculation"):
        gr.Markdown("""
        ## FLOP Calculation

        FLOP Calculation calculates the number of theoretical FLOPs required to train a model on t tokens.
        See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how FLOPs are calculated.
        Other good resources that we consulted are the [Chinchilla Paper](https://arxiv.org/abs/2203.15556) and
        [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://people.eecs.berkeley.edu/~matei/papers/2021/sc_megatron_lm.pdf).
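        As a rule of thumb, total training compute for a dense model is about 6·N·D FLOPs (N = parameters, D = tokens): for example, a 1B-parameter model trained on 300B tokens costs roughly 6 × 1e9 × 3e11 = 1.8e21 FLOPs, i.e. 1.8 ZFLOPs.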
""") | |
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
with gr.Column("Generatable"): | |
kv_size_ratio = gr.Number( | |
label="KV Size Ratio", | |
value=1.0, | |
info="Ratio of kv heads to query heads used in model. 1.0 for MHA" | |
) | |
ffn_expansion_factor = gr.Number( | |
label="FFN Expansion Factor", | |
value=4, | |
info="How much the MLP hidden size expands" | |
) | |
batch_size = gr.Number( | |
label="Batch Size", | |
value=1, | |
info="Global batch size in units of samples" | |
) | |
tokens = gr.Number( | |
label="Number of GigaTokens", | |
value=300, | |
info="Total number of GigaTokens for training" | |
) | |
checkpoint_activations = gr.Checkbox( | |
label="Checkpoint Activations", | |
value=True, | |
info="Whether Megatron-style activation checkpointing is being used" | |
) | |
infer = gr.Checkbox( | |
label="Inference-Only", | |
value=False, | |
info="Whether the model is being used for inference-only" | |
) | |
                # MoE parameters hidden in accordion
                with gr.Accordion("Mixture of Experts (MoE)", open=False):
                    moe = gr.Checkbox(
                        label="Mixture of Experts (MoE)",
                        value=False,
                        info="Whether the model uses Mixture of Experts"
                    )
                    num_experts = gr.Number(
                        label="Number of Experts",
                        value=128,
                        info="Number of experts for Mixture of Experts (MoE)"
                    )
                    expert_interval = gr.Number(
                        label="Expert Interval",
                        value=2,
                        info="Expert interval for Mixture of Experts (MoE)"
                    )
                    topk = gr.Number(
                        label="Top K Routing for MoE",
                        value=1,
                        info="Top k routing for Mixture of Experts (MoE)"
                    )

        calc_flops_button = gr.Button("Calculate FLOPs")
        flops_result = gr.JSON(label="FLOP Calculation Result")

        calc_flops_button.click(
            calc_flops,
            inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
            outputs=flops_result
        )

        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length]
        )

demo.launch()