import os
import re
import webbrowser

import pandas as pd
import gradio as gr
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
from accelerate.commands.estimate import create_empty_model, check_has_model
from accelerate.utils import convert_bytes, calculate_maximum_sizes

# We need to store them as globals because gradio doesn't have a way for us to pass them into the button
HAS_DISCUSSION = True
MODEL_NAME = None
LIBRARY = None
USER_TOKEN = None
TOKEN = os.environ.get("HUGGINGFACE_API_LOGIN", None)


def check_for_discussion(model_name: str):
    "Checks if an automated discussion has been opened on the model by `model-sizer-bot`"
    global TOKEN
    api = HfApi(token=TOKEN)
    discussions = list(api.get_repo_discussions(model_name))
    return any(
        discussion.title == "[AUTOMATED] Model Memory Requirements" and discussion.author == "model-sizer-bot"
        for discussion in discussions
    )


def report_results():
    "Reports the results of a memory calculation to the model's discussion page, and opens a new tab to it afterwards"
    global MODEL_NAME, LIBRARY, TOKEN, USER_TOKEN
    api = HfApi(token=TOKEN)
    results, data = calculate_memory(
        MODEL_NAME, LIBRARY, ["fp32", "fp16", "int8", "int4"], "adamw_hf", access_token=USER_TOKEN, raw=True
    )
    minimum = data[0]
    USER_TOKEN = None
    post = f"""# Model Memory Requirements\n
You will need about {minimum['inference_total']} VRAM to load this model for inference, and {minimum['training_total']} VRAM to train it using Adam.

These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.

The minimum recommended VRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.

When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).

## Results:

{results}
"""
    discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
    webbrowser.open_new_tab(discussion.url)


def convert_url_to_name(url: str):
    "Converts a model URL to its name on the Hub"
    # Accept model URLs with or without a trailing fragment/query, e.g.
    # https://huggingface.co/bert-base-cased or https://huggingface.co/bert-base-cased#readme
    results = re.findall(r"huggingface\.co/([^\s#?]+)", url)
    if len(results) < 1:
        raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
    return results[0]


# Based on the following docs:
#
# - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory
# - https://blog.eleuther.ai/transformer-math/
# - https://kipp.ly/transformer-inference-arithmetic/
# - https://github.com/ray-project/llm-numbers
def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
    # is_16bit = cfg.bf16 or cfg.bfloat16 or cfg.load_in_8bit or cfg.fp16 or cfg.float16
    # if torch.cuda.device_count() > 1 or cfg.fsdp or os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" or cfg.adapter:
    #     return { 'supported': False }

    # Model Weights
    #
    # The Hf doc counts:
    #
    # - 4 bytes * number of parameters for fp32 training
    # - 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
    #
    # But we follow https://blog.eleuther.ai/transformer-math/#model-weights and count 2 bytes here for mixed precision
    # training, leaving the rest to the optimizer state.
    #
    # Here we calculate only for fp32 and adjust for each dtype outside.
    #
    # for param in model.parameters():
    #     print(f'{type(param)} {param.shape} {param.element_size()}')
    # print(f'total parameters = {sum([param.nelement() for param in model.parameters()])}')
    param_element_size = 4
    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])
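
    # Quick sanity check of the fp32 weight term above (hypothetical 110M-parameter model;
    # the parameter count is assumed for illustration only, not read from any checkpoint):
    #   110e6 params * 4 bytes/param ≈ 0.44 GB of fp32 weights
    # print(f'fp32 weights = {convert_bytes(vram_model)}')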

    # Buffers
    #
    # Buffers are tensors that do not require gradients and are not registered as parameters,
    # e.g. the running mean and std in batch norm layers.
    #
    # - https://github.com/huggingface/transformers/blob/d4bd33cc9f11ca48635e54983d75249c78d72e2a/src/transformers/modeling_utils.py#L1897
    # - https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
    #
    # for buf in model.buffers():
    #     print(f'buf.element_size() = {buf.element_size()}')
    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Optimizer States:
    #
    # - 8 bytes * number of parameters for normal AdamW (maintains 2 states)
    # - 2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes
    # - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
    #
    # For now we use AdamW/SGD as the baseline for the estimation, even for other more memory-efficient optimizers.
    #
    # Optimizer name constants, for reference:
    # ADAMW_HF = "adamw_hf"
    # ADAMW_TORCH = "adamw_torch"
    # ADAMW_TORCH_FUSED = "adamw_torch_fused"
    # ADAMW_TORCH_XLA = "adamw_torch_xla"
    # ADAMW_APEX_FUSED = "adamw_apex_fused"
    # ADAFACTOR = "adafactor"
    # ADAMW_ANYPRECISION = "adamw_anyprecision"
    # SGD = "sgd"
    # ADAGRAD = "adagrad"
    # ADAMW_BNB = "adamw_bnb_8bit"
    # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
    # LION_8BIT = "lion_8bit"
    # LION = "lion_32bit"
    # PAGED_ADAMW = "paged_adamw_32bit"
    # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
    # PAGED_LION = "paged_lion_32bit"
    # PAGED_LION_8BIT = "paged_lion_8bit"
    #
    # optimizer = cfg.optimizer
    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])

    # Gradients
    #
    # 4 bytes * number of parameters for either fp32 or mixed precision training
    # (gradients are always kept in fp32), but we follow transformer-math and treat this conditionally outside.
    # For now we ignore whether this is mixed-precision training.
    # NOTE: vram_gradient is computed here but not yet included in the returned totals.
    gradient_element_size = 4  # 2 if is_16bit else 4
    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])
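
    # Quick sanity check of the per-parameter byte counts above (same hypothetical 110M-parameter model,
    # numbers assumed for illustration only):
    #   AdamW ("adamw_hf"): 110e6 * 8 bytes ≈ 0.88 GB of optimizer state
    #   8-bit AdamW:        110e6 * 2 bytes ≈ 0.22 GB
    #   SGD:                110e6 * 4 bytes ≈ 0.44 GB
    #   fp32 gradients:     110e6 * 4 bytes ≈ 0.44 GB (not yet added to the returned totals)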

    # Forward Activations
    #
    # Size depends on many factors, the key ones being sequence length, hidden size and batch size.
    s = sequence_len        # cfg.sequence_len
    b = micro_batch_size    # cfg.micro_batch_size
    h = model.config.hidden_size
    L = model.config.num_hidden_layers
    t = device_count        # max(1, torch.cuda.device_count())  # len(DataParallel(model).device_ids)
    a = model.config.num_attention_heads
    print(f's={s} b={b} h={h} L={L} t={t} a={a}')

    sbHL = s * b * h * L
    print(f'sbHL = {sbHL / 1e9} GB')
    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))

    return {
        # 'supported': True,
        'param_element_size': param_element_size,
        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
        'model': vram_model,
        'buffer': vram_buffer,
        'optimizer': vram_optimizer,
        'activation': vram_activation,
    }
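
# Illustrative reading of the activation estimate in calc_vram_f32 (all numbers assumed, roughly
# BERT-base-like: s=2048, b=1, h=768, L=12, a=12, t=1):
#   sbHL = 2048 * 1 * 768 * 12 ≈ 18.9e6
#   with gradient checkpointing:    sbHL * (10 + 24/1) ≈ 0.64 GB
#   without gradient checkpointing: sbHL * (10 + 24/1 + 5*12*2048/768) ≈ 3.66 GB
# These figures follow the formula above and are estimates, not benchmarked measurements.
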
def bytes_by_dtype(bytes, dtype):
    if dtype in ("fp16", "bf16", "float16/bfloat16"):
        return bytes / 2
    elif dtype == "int8":
        return bytes / 4
    elif dtype == "int4":
        return bytes / 8
    else:
        return bytes


def calculate_memory(model_name: str, library: str, dtypes: list, optimizer: str, access_token: str, raw=False):
    "Calculates the memory usage for a model"
    if library == "auto":
        library = None
    if "http" in model_name and "//" in model_name:
        try:
            model_name = convert_url_to_name(model_name)
        except ValueError:
            raise gr.Error(f"URL `{model_name}` is not a valid model URL to the Hugging Face Hub")
    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here: https://huggingface.co/settings/tokens."
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(
            f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)."
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo.")

    total_size, largest_layer = calculate_maximum_sizes(model)

    data = []
    title = f"Memory Usage for '{model_name}'"
    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)
    for dtype in dtypes:
        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
        vram_buffer = vram_f32['buffer']
        vram_optimizer = vram_f32['optimizer']
        vram_activation = vram_f32['activation']
        row = {
            "dtype": dtype,
            'inference_total': convert_bytes(vram_model),
            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
            'model': convert_bytes(vram_model),
            'buffer': convert_bytes(vram_buffer),
            'optimizer': convert_bytes(vram_optimizer),
            'activation': convert_bytes(vram_activation),
        }
        data.append(row)

        # dtype_total_size = total_size
        # dtype_largest_layer = largest_layer[0]
        # if dtype in ("fp16", "bf16", "float16/bfloat16"):
        #     dtype_total_size /= 2
        #     dtype_largest_layer /= 2
        # elif dtype == "int8":
        #     dtype_total_size /= 4
        #     dtype_largest_layer /= 4
        # elif dtype == "int4":
        #     dtype_total_size /= 8
        #     dtype_largest_layer /= 8
        # dtype_training_size = convert_bytes(dtype_total_size * 4)
        # dtype_total_size = convert_bytes(dtype_total_size)
        # dtype_largest_layer = convert_bytes(dtype_largest_layer)
        # data.append({
        #     "dtype": dtype,
        #     "Largest Layer or Residual Group": dtype_largest_layer,
        #     "Total Size": dtype_total_size,
        #     "Training using Adam": dtype_training_size,
        #     "Test": 12345,
        # })

    global HAS_DISCUSSION, MODEL_NAME, LIBRARY
    HAS_DISCUSSION = check_for_discussion(model_name)
    MODEL_NAME = model_name
    LIBRARY = library

    if raw:
        return pd.DataFrame(data).to_markdown(index=False), data

    results = [
        f'## {title}',
        gr.update(visible=True, value=pd.DataFrame(data)),
        # gr.update(visible=not HAS_DISCUSSION),
    ]
    return results


with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """

# 🤗 Model Memory Calculator

This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:

- Focuses on transformers and gives a more detailed estimation based on more configs
- Will auto-calculate the proper batch size given a VRAM constraint later
- LoRA/QLoRA etc. will be supported later

Note:

- inference_total = model
- training_total = model + buffer + optimizer + activation
"""
        )
        out_text = gr.Markdown()
        out = gr.DataFrame(
            headers=[
                "dtype",
                'inference_total',
                'training_total',
                'model',
                'buffer',
                'optimizer',
                'activation',
            ],
            interactive=False,
            visible=False,
        )
        with gr.Row():
            inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
        with gr.Row():
            library = gr.Radio(["transformers"], label="Library", value="transformers")
            dtypes = gr.CheckboxGroup(
                ["float32", "float16/bfloat16", "int8", "int4"],
                value=["float32", "float16/bfloat16", "int8", "int4"],
                label="Model Precision",
            )
            # Optimizer name constants, for reference:
            # ADAMW_HF = "adamw_hf"
            # ADAMW_TORCH = "adamw_torch"
            # ADAMW_TORCH_FUSED = "adamw_torch_fused"
            # ADAMW_TORCH_XLA = "adamw_torch_xla"
            # ADAMW_APEX_FUSED = "adamw_apex_fused"
            # ADAFACTOR = "adafactor"
            # ADAMW_ANYPRECISION = "adamw_anyprecision"
            # SGD = "sgd"
            # ADAGRAD = "adagrad"
            # ADAMW_BNB = "adamw_bnb_8bit"
            # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
            # LION_8BIT = "lion_8bit"
            # LION = "lion_32bit"
            # PAGED_ADAMW = "paged_adamw_32bit"
            # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
            # PAGED_LION = "paged_lion_32bit"
            # PAGED_LION_8BIT = "paged_lion_8bit"
            optimizer = gr.Dropdown(
                choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
                value="adamw_hf",
                label="Optimizer",
                allow_custom_value=True,
            )
            access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
        with gr.Row():
            btn = gr.Button("Calculate Memory Usage")
            # post_to_hub = gr.Button(value="Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
        USER_TOKEN = access_token

        btn.click(
            calculate_memory,
            inputs=[inp, library, dtypes, optimizer, access_token],
            outputs=[out_text, out],
        )
        # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)

demo.launch()  # (share=True, inline=False, debug=True)
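
# Example of using the estimator without the UI (illustrative sketch; "bert-base-cased" is just a sample
# model id, and building its empty model requires network access, so this is left commented out):
#
# markdown_table, rows = calculate_memory(
#     "bert-base-cased", "transformers", ["float32", "float16/bfloat16"], "adamw_hf", access_token=None, raw=True
# )
# print(markdown_table)
# print(rows[0]['training_total'])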