import math

import gradio as gr
from transformers import AutoConfig

from calc_params import calc_params

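# Gradio front-end for the EleutherAI cookbook calculators: the tabs below estimate
# per-GPU training memory, parameter counts (via calc_params.calc_params), and
# theoretical training/inference FLOPs for a transformer described by its hyperparameters.
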
def get_hf_model_args(hf_model_name_or_path):
    try:
        config = AutoConfig.from_pretrained(hf_model_name_or_path, trust_remote_code=True).to_dict()
    except Exception as e:
        raise gr.Error(f"Error fetching Hugging Face model: {str(e)}")

    num_layers = config.get("num_hidden_layers", None)
    hidden_size = config.get("hidden_size", None)
    num_attention_heads = config.get("num_attention_heads", None)
    vocab_size = config.get("vocab_size", None)
    sequence_length = config.get("max_position_embeddings", None)

    return {
        "num_layers": num_layers,
        "hidden_size": hidden_size,
        "num_attention_heads": num_attention_heads,
        "vocab_size": vocab_size,
        "sequence_length": sequence_length,
    }

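# Returns six gr.update values: the five hyperparameter fields fetched from the config,
# plus an empty string used to clear the result box of the tab that triggered the change.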
def update_from_hf_model(hf_model_name_or_path):
    model_params = get_hf_model_args(hf_model_name_or_path)

    return (
        gr.update(value=model_params["num_layers"]),
        gr.update(value=model_params["hidden_size"]),
        gr.update(value=model_params["num_attention_heads"]),
        gr.update(value=model_params["vocab_size"]),
        gr.update(value=model_params["sequence_length"]),
        "",
    )

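# Rough per-GPU memory estimate in the spirit of Transformers Math 101: count the model
# parameters from the hyperparameters, multiply by bytes per parameter (2 for mixed
# precision, 4 for fp32), divide by the model-parallel degree (TP * PP), and add a
# user-supplied miscellaneous overhead. Optimizer, gradient, and activation memory are
# not included in this estimate.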
def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib):
    model_params = get_hf_model_args(hf_model_name_or_path) if hf_model_name_or_path else None

    if model_params:
        num_layers = model_params["num_layers"] or num_layers
        hidden_size = model_params["hidden_size"] or hidden_size
        num_attention_heads = model_params["num_attention_heads"] or num_attention_heads
        vocab_size = model_params["vocab_size"] or vocab_size
        sequence_length = model_params["sequence_length"] or sequence_length

    dp_degree = num_gpus / (tensor_parallel_size * pipeline_parallel_size)
    embed_params = 2 * vocab_size * hidden_size
    positional_params = hidden_size * sequence_length
    ln_params = 8 * hidden_size * num_layers + (2 * hidden_size)
    attention_params = int(2 * (1 + ffn_expansion_factor) * num_layers * hidden_size * hidden_size)
    mlp_params = ffn_expansion_factor * num_layers * hidden_size * hidden_size
    total_params = embed_params + positional_params + ln_params + attention_params + mlp_params

    bytes_per_param = 2 if is_mixed_precision else 4
    model_mem = total_params * bytes_per_param
    per_gpu_mem_gib = (model_mem / (tensor_parallel_size * pipeline_parallel_size)) / 1024**3 + misc_mem_gib

    return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"

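# Training FLOPs are counted as roughly 2 * params * tokens for the forward pass and
# twice that for the backward pass, hence iter_factor = 3 for training; activation
# checkpointing adds one extra forward pass, and inference counts only a single forward
# pass (iter_factor = 1). The per-component terms below follow that convention.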
def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
    tokens = 1e9 * tokens

    iter_factor = 3
    if checkpoint_activations:
        iter_factor += 1
    if infer:
        iter_factor = 1

    qkv_flops = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
    attention_matrix_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    attention_over_values_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    linear_projection_flops = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
    ffn_flops = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
    embedding_flops = 6 * tokens * hidden_size * vocab_size

    if moe and topk > 1:
        ffn_flops += ffn_flops * topk / expert_interval

    if moe:
        gating_flops = 2 * num_experts * hidden_size / expert_interval

    total_flops = qkv_flops + attention_matrix_flops + attention_over_values_flops + linear_projection_flops + ffn_flops + embedding_flops

    if moe:
        total_flops += gating_flops

    def convert_flops(params):
        if params == 0:
            return "0"
        size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
        i = int(math.floor(math.log(params, 1000)))
        p = math.pow(1000, i)
        s = round(params / p, 2)
        return f"{s} {size_name[i]}"

    return {
        'qkv_flops': convert_flops(qkv_flops),
        'attention_matrix_flops': convert_flops(attention_matrix_flops),
        'attention_over_values_flops': convert_flops(attention_over_values_flops),
        'linear_projection_flops': convert_flops(linear_projection_flops),
        'ffn_flops': convert_flops(ffn_flops),
        'embedding_flops': convert_flops(embedding_flops),
        'total_flops': convert_flops(total_flops)
    }

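# Illustrative only: calling calc_flops directly (outside the UI) with the same defaults
# the FLOP tab uses below. The keyword names mirror the positional parameters above.
#
#   flops = calc_flops(
#       vocab_size=51200, hidden_size=6144, sequence_length=2048, num_layers=44,
#       kv_size_ratio=1.0, topk=1, moe=False, num_experts=128, expert_interval=2,
#       batch_size=1, tokens=300, checkpoint_activations=True,
#       ffn_expansion_factor=4, infer=False,
#   )
#   print(flops["total_flops"])
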
with gr.Blocks(theme="ysharma/TransformerCalculatorNew") as demo:
    with gr.Accordion("Credits and General Idea", open=False):
        gr.Markdown("""
This app is a re-creation of [this calculator](https://github.com/EleutherAI/cookbook/tree/main/calc) from EleutherAI.

Before training or inference even begins, common practical questions about potential models must be answered, such as:
1. How many parameters are we targeting? How should those parameters be allocated within the model?
2. How many FLOPs does the model from step 1 take to train on t tokens? How about inference?
3. How much memory does the model from step 1 take to train/infer on d devices? What memory-saving strategies (e.g., parallelism, quantization, etc.) are necessary to fit the model on device memory?
        """)
    with gr.Tab("Memory Calculation"):
        gr.Markdown("""
## Memory Calculation

Memory Calculation estimates the amount of device memory required to train or infer a model. See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how memory overhead is calculated.
Take this estimation with a grain of salt, because every implementation is different and these calculations were written to match the GPT-NeoX library as closely as possible.
Even for other training and inference libraries, however, we expect our script to give memory estimates within acceptable error.
(Please see [LLM finetuning memory requirements](https://blog.scottlogic.com/2023/11/24/llm-mem.html) for a treatment of how specific memory costs may vary framework-to-framework). Other good resources that we consulted are the [ZeRO Paper](https://arxiv.org/abs/1910.02054) and [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198.pdf).
        """)
        with gr.Accordion("How to use it?", open=False):
            gr.Markdown("""
## To Use

Fill in the required details below and click 'Calculate Memory' to get a result.
            """)
        with gr.Row():
            with gr.Column("Generatable"):
                gr.Markdown("## Generatable")
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    num_attention_heads = gr.Number(
                        label="Number of Attention Heads",
                        value=64,
                        info="Number of attention heads used in the model"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                gr.Markdown("## User Defined")
                num_gpus = gr.Number(
                    label="Number of GPUs",
                    value=1,
                    info="Number of GPUs used for training"
                )
                tensor_parallel_size = gr.Number(
                    label="Tensor Parallel Size",
                    value=1,
                    info="Tensor parallel degree (1 if not used)"
                )
                pipeline_parallel_size = gr.Number(
                    label="Pipeline Parallel Size",
                    value=1,
                    info="Pipeline parallel degree (1 if not used)"
                )
                batch_size_per_gpu = gr.Number(
                    label="Batch Size per GPU",
                    value=8,
                    info="Batch size per GPU"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                is_mixed_precision = gr.Checkbox(
                    label="Mixed Precision",
                    value=True,
                    info="Whether mixed precision is enabled"
                )
                misc_mem_gib = gr.Number(
                    label="Miscellaneous Memory Overhead (GiB)",
                    value=5,
                    info="Miscellaneous per-GPU memory overhead from DL frameworks, communication libraries, etc."
                )

        calc_memory_button = gr.Button("Calculate Memory")
        memory_result = gr.Textbox(label="Memory Calculation Result", interactive=False)
        calc_memory_button.click(
            calc_mem,
            inputs=[
                hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib
            ],
            outputs=memory_result
        )

        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, memory_result]
        )
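
    # Parameter Calculation tab: delegates the actual counting to calc_params (imported from calc_params.py).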
    with gr.TabItem("Parameter Calculation"):
        gr.Markdown("""
## Parameter Calculation

Parameter Calculation calculates the number of parameters present in a given model based on its hyperparameters.
Such calculations are important for determining memory overheads, FLOPs, or the size of an unknown transformer model.
We also found the following resources helpful:
[How does GPT-3 spend its 175B parameters?](https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters)
and [LLM Parameter Counting](https://kipp.ly/transformer-param-count/).

Note that parameter counts are already shown for `.safetensors` checkpoints in the Hugging Face Hub file explorer.

## How To Use

Simply input the model details, such as the hidden size, number of layers, and attention heads, and press 'Calculate Parameters' to get a result.
        """)
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                tied_embeddings = gr.Checkbox(
                    label="Tied Embeddings",
                    value=False,
                    info="Whether embeddings are tied (shared between input and output)"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                num_mlp_linears = gr.Number(
                    label="Number of Linear Layers per MLP Block",
                    value=2,
                    info="How many linear layers per MLP block"
                )
                kv_size_ratio = gr.Number(
                    label="KV Size Ratio",
                    value=1.0,
                    info="Ratio of key/value heads to query heads: 1.0 for MHA, 1/num_attention_heads for MQA"
                )

                with gr.Accordion("MoE Parameters", open=False):
                    moe = gr.Checkbox(
                        label="MoE",
                        value=False,
                        info="Whether the model is MoE"
                    )
                    num_experts = gr.Number(
                        label="Number of Experts",
                        value=8,
                        info="Number of experts for MoE"
                    )
                    expert_interval = gr.Number(
                        label="Expert Interval",
                        value=1,
                        info="Expert interval for MoE"
                    )
                    topk = gr.Number(
                        label="Top k Routing",
                        value=1,
                        info="Top k routing for MoE"
                    )

        calc_param_button = gr.Button("Calculate Parameters")
        param_result = gr.Textbox(label="Parameter Calculation Result", interactive=False)
        calc_param_button.click(
            calc_params,
            inputs=[vocab_size, tied_embeddings, hidden_size, sequence_length, num_layers, moe, num_experts, expert_interval, topk, ffn_expansion_factor, num_mlp_linears, kv_size_ratio],
            outputs=param_result
        )

        # update_from_hf_model returns six updates (five hyperparameter fields plus an
        # empty string), so six output components are listed here.
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, param_result]
        )
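
    # FLOP Calculation tab: reports the theoretical FLOP breakdown from calc_flops as JSON.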
    with gr.TabItem("FLOP Calculation"):
        gr.Markdown("""
## FLOP Calculation

FLOP Calculation calculates the number of theoretical FLOPs required to train a model on t tokens.
See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how FLOPs are calculated.
Other good resources that we consulted are the [Chinchilla Paper](https://arxiv.org/abs/2203.15556) and
[Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://people.eecs.berkeley.edu/~matei/papers/2021/sc_megatron_lm.pdf).
        """)
        with gr.Row():
            with gr.Column("Generatable"):
                with gr.Group():
                    hf_model_name_or_path = gr.Textbox(
                        label="HuggingFace Model Name or Path",
                        info="Name of the HuggingFace Hub repository or the local file path for it"
                    )
                    vocab_size = gr.Number(
                        label="Vocab Size",
                        value=51200,
                        info="How many tokens are in the embedding layer"
                    )
                    hidden_size = gr.Number(
                        label="Hidden Size",
                        value=6144,
                        info="Dimension of the model's hidden size"
                    )
                    sequence_length = gr.Number(
                        label="Sequence Length",
                        value=2048,
                        info="Sequence length used for training"
                    )
                    num_layers = gr.Number(
                        label="Number of Layers",
                        value=44,
                        info="Number of transformer layers used in the model"
                    )
            with gr.Column("User Defined"):
                kv_size_ratio = gr.Number(
                    label="KV Size Ratio",
                    value=1.0,
                    info="Ratio of key/value heads to query heads used in the model. 1.0 for MHA"
                )
                ffn_expansion_factor = gr.Number(
                    label="FFN Expansion Factor",
                    value=4,
                    info="How much the MLP hidden size expands"
                )
                batch_size = gr.Number(
                    label="Batch Size",
                    value=1,
                    info="Global batch size in units of samples"
                )
                tokens = gr.Number(
                    label="Number of GigaTokens",
                    value=300,
                    info="Total number of GigaTokens for training"
                )
                checkpoint_activations = gr.Checkbox(
                    label="Checkpoint Activations",
                    value=True,
                    info="Whether Megatron-style activation checkpointing is being used"
                )
                infer = gr.Checkbox(
                    label="Inference-Only",
                    value=False,
                    info="Whether the model is being used for inference only"
                )

                with gr.Accordion("Mixture of Experts (MoE)", open=False):
                    moe = gr.Checkbox(
                        label="Mixture of Experts (MoE)",
                        value=False,
                        info="Whether the model uses Mixture of Experts"
                    )
                    num_experts = gr.Number(
                        label="Number of Experts",
                        value=128,
                        info="Number of experts for Mixture of Experts (MoE)"
                    )
                    expert_interval = gr.Number(
                        label="Expert Interval",
                        value=2,
                        info="Expert interval for Mixture of Experts (MoE)"
                    )
                    topk = gr.Number(
                        label="Top K Routing for MoE",
                        value=1,
                        info="Top k routing for Mixture of Experts (MoE)"
                    )

        calc_flops_button = gr.Button("Calculate FLOPs")
        flops_result = gr.JSON(label="FLOP Calculation Result")
        calc_flops_button.click(
            calc_flops,
            inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
            outputs=flops_result
        )

        # update_from_hf_model returns six updates, so six output components are listed;
        # the final empty string resets the result display when a new model is loaded.
        hf_model_name_or_path.change(
            fn=update_from_hf_model,
            inputs=[hf_model_name_or_path],
            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, flops_result]
        )

demo.launch()