# -*- coding: utf-8 -*-
"""LLM Training Cost Calculator App.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1iZpCUgC5T_ASnlDgMYm1n4RH8BZsm7sx
"""

# !pip install gradio

import gradio as gr


def estimate_training_cost(gpu_choice, precision, number_of_parameters, number_of_tokens,
                           utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85):
    """
    Estimates the training cost of a large language model based on the selected GPU and precision.

    Args:
    - gpu_choice (str): The choice of GPU, e.g., 'A100 80GB PCIe', 'V100', etc.
    - precision (str): The precision level for the GPU, e.g., 'bf16', 'tf32', 'tensor'.
    - number_of_parameters (int): The number of parameters in the model.
    - number_of_tokens (int): The number of tokens to train on.
    - utilization_rate (float, optional): The utilization rate of the GPU (0 < utilization_rate ≤ 1). Default is 0.5 (50%).
    - overhead (float, optional): Multiplier to account for overhead and additional costs (1 + overhead percentage). Default is 1.10 (10% overhead).
    - cost_per_gpu_hour (float, optional): The cost per hour of using the GPU. Default is $1.85/hour.

    Returns:
    - float: The estimated total cost of training the model.

    Raises:
    - ValueError: If the selected precision is not available for the chosen GPU.

    The function looks up the GPU throughput for the selected GPU and precision from a table of
    predefined values. The estimate assumes roughly 6 FLOPs per parameter per token, i.e. training
    cost scales linearly with the number of parameters and the number of tokens.
    """
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},  # Assuming only the deep learning performance for V100
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }

    # Guard against GPU/precision combinations that are not in the table
    # (e.g., 'tensor' with an A100, or 'bf16' with a V100), which would otherwise raise a KeyError.
    if precision not in gpu_throughputs.get(gpu_choice, {}):
        supported = ', '.join(gpu_throughputs.get(gpu_choice, {}))
        raise ValueError(f"Precision '{precision}' is not available for {gpu_choice}. Supported: {supported}")

    # Get the correct GPU throughput
    gpu_throughput = gpu_throughputs[gpu_choice][precision]

    # Calculate the total number of FLOPs required for training
    total_flops = 6 * number_of_parameters * number_of_tokens

    # Calculate the number of hours required on the selected GPU
    gpu_hours = total_flops / (gpu_throughput * 3600)

    # Adjust for the actual utilization of the GPUs
    adjusted_gpu_hours = gpu_hours / utilization_rate

    # Account for the overhead
    actual_gpu_hours = adjusted_gpu_hours * overhead

    # Calculate the total cost
    total_cost = actual_gpu_hours * cost_per_gpu_hour

    return total_cost


def gradio_interface(gpu_choice, precision, number_of_parameters, number_of_tokens,
                     utilization_rate, overhead, cost_per_gpu_hour):
    number_of_parameters = float(number_of_parameters) * 1e9  # Convert from billions to actual number
    number_of_tokens = float(number_of_tokens) * 1e12  # Convert from trillions to actual number
    utilization_rate = float(utilization_rate)
    overhead = float(overhead)
    cost_per_gpu_hour = float(cost_per_gpu_hour)

    cost = estimate_training_cost(gpu_choice, precision, number_of_parameters, number_of_tokens,
                                  utilization_rate=utilization_rate, overhead=overhead,
                                  cost_per_gpu_hour=cost_per_gpu_hour)
    return f"The estimated training cost is ${cost:,.2f}"


gpu_choices = ["A100 80GB PCIe", "A100 80GB SXM", "V100", "H100 SXM", "H100 PCIe"]
default_precisions = ['bf16', 'tf32', 'tensor', 'bf16', 'bf16']  # Default precision for each GPU
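# Illustrative sanity check (not part of the app): reproducing the default UI values by hand.
# With 70B parameters, 2T tokens, an A100 80GB PCIe at bf16 (312 TFLOPS), 50% utilization,
# 10% overhead and $1.85 per GPU-hour, the numbers work out roughly as follows:
#   total_flops ≈ 6 * 70e9 * 2e12 = 8.4e23 FLOPs
#   gpu_hours   ≈ 8.4e23 / (312e12 * 3600) ≈ 7.5e5, doubled to ≈ 1.5e6 at 50% utilization
#   cost        ≈ 1.5e6 * 1.10 * 1.85 ≈ $3.0M
example_cost = estimate_training_cost('A100 80GB PCIe', 'bf16', 70e9, 2e12)
print(f"Example estimate (70B params, 2T tokens, A100 80GB PCIe, bf16): ${example_cost:,.2f}")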

# Define the title and description for the Gradio app
title = "LLM Training Cost Calculator"

description = """

Estimate the cost of training large language models (LLMs). This tool calculates an approximate cost from the model's parameter count, the number of training tokens, and your choice of GPU and precision. Select a GPU and a precision level to get a cost estimate.

Available GPUs and Precisions:

- A100 80GB PCIe: bf16, tf32
- A100 80GB SXM: bf16, tf32
- V100: tensor
- H100 SXM: bf16, tf32
- H100 PCIe: bf16, tf32

The choice of GPU and precision determines the peak throughput used in the estimate, which in turn affects training time and cost. BFLOAT16 (bf16) is generally faster and more cost-effective, while TensorFloat-32 (tf32) offers higher numerical precision. For the V100, the calculator uses its Tensor Core deep learning throughput.

We plan to extend this calculator to estimate the cost of fine-tuning models with strategies such as LoRA or QLoRA. Stay tuned for updates where you'll be able to input the model ID from the Hugging Face Hub, select the fine-tuning strategy, and specify quantization details if QLoRA is chosen.

""" iface = gr.Interface( fn=gradio_interface, inputs=[ gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'), gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision", value='bf16'), gr.Textbox(label="Number of Parameters (in billions)", value="70"), gr.Textbox(label="Number of Tokens (in trillions)", value="2"), gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"), gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead (1 + overhead percentage)"), gr.Textbox(label="Cost per GPU Hour ($)", value="1.85") ], outputs=[gr.Textbox(label="Estimated Training Cost")], title=title, description=description, article="

Developed with ❤️ by Elfilali Ali

" ) iface.launch()