Add LoRA/QLoRA modes, model comparison, search, throughput, cost estimates, and export
Browse files
New features:
- LoRA and QLoRA fine-tuning memory estimation modes
- Model comparison tab for side-by-side VRAM analysis
- Model search with HuggingFace API integration
- Throughput estimation (tokens/sec) per GPU
- Cloud cost estimates (hourly/daily/monthly)
- Flash Attention toggle with memory savings display
- Export results to JSON or plain text
- Tabbed interface (Calculator, Compare, Export)
Updated GPU specs with TFLOPs and hourly costs for cloud instances.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- .gitignore +2 -0
- app.py +674 -136
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/*
|
| 2 |
+
.claude/*
|
app.py
CHANGED
|
@@ -7,40 +7,45 @@ Fetches model metadata from HF Hub and calculates:
|
|
| 7 |
- Recommended GPUs and cloud instances
|
| 8 |
- Multi-GPU tensor parallelism estimates
|
| 9 |
- Quantization options with detailed breakdown
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
-
from huggingface_hub import HfApi, hf_hub_download
|
| 14 |
import json
|
| 15 |
from functools import lru_cache
|
|
|
|
| 16 |
|
| 17 |
# Initialize HF API client
|
| 18 |
api = HfApi()
|
| 19 |
|
| 20 |
-
# GPU specs: name -> (VRAM in GB, typical cloud instance, category)
|
| 21 |
GPU_SPECS = {
|
| 22 |
# Consumer GPUs
|
| 23 |
-
"RTX 3080": (10, "Consumer", "consumer"),
|
| 24 |
-
"RTX 3090": (24, "Consumer", "consumer"),
|
| 25 |
-
"RTX 4080": (16, "Consumer", "consumer"),
|
| 26 |
-
"RTX 4090": (24, "Consumer", "consumer"),
|
| 27 |
-
"RTX 5090": (32, "Consumer (est.)", "consumer"),
|
| 28 |
# Apple Silicon
|
| 29 |
-
"M2 Ultra": (192, "Mac Studio (Unified)", "apple"),
|
| 30 |
-
"M3 Max": (128, "MacBook Pro (Unified)", "apple"),
|
| 31 |
-
"M4 Max": (128, "MacBook Pro (Unified)", "apple"),
|
| 32 |
# Workstation GPUs
|
| 33 |
-
"RTX A6000": (48, "Workstation", "workstation"),
|
| 34 |
-
"L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud"),
|
| 35 |
# Cloud GPUs
|
| 36 |
-
"A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud"),
|
| 37 |
-
"L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud"),
|
| 38 |
-
"A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud"),
|
| 39 |
-
"A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud"),
|
| 40 |
-
"H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud"),
|
| 41 |
-
"H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud"),
|
| 42 |
# AMD GPUs
|
| 43 |
-
"MI300X": (192, "AMD Cloud Instances", "amd"),
|
| 44 |
}
|
| 45 |
|
| 46 |
# Bytes per element for different dtypes
|
|
@@ -236,6 +241,175 @@ def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism:
|
|
| 236 |
}
|
| 237 |
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
def calculate_vram(
|
| 240 |
model_id: str,
|
| 241 |
context_length: int = 4096,
|
|
@@ -244,7 +418,11 @@ def calculate_vram(
|
|
| 244 |
optimizer: str = "AdamW",
|
| 245 |
serving_framework: str = "None (raw PyTorch)",
|
| 246 |
num_gpus: int = 1,
|
| 247 |
-
parallelism: str = "Tensor Parallelism"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
) -> tuple[str, dict | None]:
|
| 249 |
"""Main calculation function. Returns (markdown_results, chart_data)."""
|
| 250 |
|
|
@@ -331,13 +509,25 @@ def calculate_vram(
|
|
| 331 |
results.append("Could not find architecture details")
|
| 332 |
kv_gb = 0
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# Calculate total based on mode
|
| 335 |
-
if mode == "Training":
|
| 336 |
training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
|
| 337 |
base_gb = bytes_to_gb(training_mem["total_base"])
|
| 338 |
|
| 339 |
# Activations estimation (rough: ~2x weights for typical batch)
|
| 340 |
activation_gb = weights_gb * 2 * batch_size
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
total_gb = base_gb + kv_gb + activation_gb
|
| 342 |
|
| 343 |
results.append(f"\n### 🎓 Training Memory Breakdown")
|
|
@@ -354,11 +544,58 @@ def calculate_vram(
|
|
| 354 |
"KV Cache": kv_gb,
|
| 355 |
"Activations": activation_gb,
|
| 356 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
else:
|
| 358 |
# Inference mode
|
| 359 |
framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
|
| 360 |
base_total = weights_gb + kv_gb
|
| 361 |
overhead_gb = base_total * (framework_overhead - 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
total_gb = base_total + overhead_gb
|
| 363 |
|
| 364 |
results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
|
|
@@ -372,6 +609,12 @@ def calculate_vram(
|
|
| 372 |
"Overhead": overhead_gb,
|
| 373 |
}
|
| 374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
|
| 376 |
|
| 377 |
# Multi-GPU calculations
|
|
@@ -389,14 +632,22 @@ def calculate_vram(
|
|
| 389 |
|
| 390 |
# GPU Recommendations
|
| 391 |
results.append(f"\n### 🎮 GPU Recommendations")
|
| 392 |
-
results.append("| GPU | VRAM | Fits? | Headroom | Instance |")
|
| 393 |
-
results.append("
|
| 394 |
|
| 395 |
-
for gpu_name, (vram, instance, category) in GPU_SPECS.items():
|
| 396 |
fits = "✅" if vram >= effective_vram_needed else "❌"
|
| 397 |
headroom = vram - effective_vram_needed
|
| 398 |
headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
# Quantization options (if model doesn't fit on consumer GPUs)
|
| 402 |
if effective_vram_needed > 24:
|
|
@@ -413,6 +664,17 @@ def calculate_vram(
|
|
| 413 |
|
| 414 |
results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
|
| 415 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
return "\n".join(results), chart_data
|
| 417 |
|
| 418 |
|
|
@@ -434,147 +696,423 @@ def create_memory_chart(chart_data: dict | None):
|
|
| 434 |
)
|
| 435 |
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
# Build Gradio interface
|
| 438 |
with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
|
| 439 |
gr.Markdown("""
|
| 440 |
-
#
|
| 441 |
|
| 442 |
-
Estimate GPU memory requirements for HuggingFace models. Supports inference
|
| 443 |
-
multi-GPU setups, and
|
| 444 |
""")
|
| 445 |
|
| 446 |
-
with gr.
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
)
|
| 453 |
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
)
|
| 462 |
-
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
label="Context Length",
|
| 465 |
minimum=512,
|
| 466 |
maximum=131072,
|
| 467 |
value=4096,
|
| 468 |
step=512,
|
| 469 |
-
info="Sequence length for KV cache"
|
| 470 |
-
)
|
| 471 |
-
with gr.Column(scale=1):
|
| 472 |
-
batch_input = gr.Slider(
|
| 473 |
-
label="Batch Size",
|
| 474 |
-
minimum=1,
|
| 475 |
-
maximum=64,
|
| 476 |
-
value=1,
|
| 477 |
-
step=1,
|
| 478 |
-
info="Concurrent sequences"
|
| 479 |
)
|
|
|
|
|
|
|
| 480 |
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
value="None (raw PyTorch)",
|
| 487 |
-
label="Serving Framework",
|
| 488 |
-
info="Different frameworks have different overhead"
|
| 489 |
-
)
|
| 490 |
-
optimizer_input = gr.Dropdown(
|
| 491 |
-
choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
|
| 492 |
-
value="AdamW",
|
| 493 |
-
label="Optimizer (Training mode)",
|
| 494 |
-
info="Optimizer state memory varies"
|
| 495 |
-
)
|
| 496 |
-
with gr.Column():
|
| 497 |
-
num_gpus_input = gr.Slider(
|
| 498 |
-
label="Number of GPUs",
|
| 499 |
-
minimum=1,
|
| 500 |
-
maximum=8,
|
| 501 |
-
value=1,
|
| 502 |
-
step=1,
|
| 503 |
-
info="For multi-GPU setups"
|
| 504 |
-
)
|
| 505 |
-
parallelism_input = gr.Dropdown(
|
| 506 |
-
choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
|
| 507 |
-
value="Tensor Parallelism",
|
| 508 |
-
label="Parallelism Strategy",
|
| 509 |
-
info="How to distribute across GPUs"
|
| 510 |
-
)
|
| 511 |
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
x="Component",
|
| 520 |
-
y="GB",
|
| 521 |
-
title="Memory Breakdown",
|
| 522 |
-
height=350,
|
| 523 |
)
|
| 524 |
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
"
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
|
|
|
|
|
|
| 546 |
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
["mistralai/Mistral-7B-v0.1", 8192, 1],
|
| 553 |
-
["Qwen/Qwen2.5-72B", 32768, 1],
|
| 554 |
-
["google/gemma-2-27b", 8192, 1],
|
| 555 |
-
["microsoft/phi-4", 16384, 1],
|
| 556 |
-
["deepseek-ai/DeepSeek-V3", 4096, 1],
|
| 557 |
-
["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
|
| 558 |
-
],
|
| 559 |
-
inputs=[model_input, context_input, batch_input],
|
| 560 |
-
label="🔥 Popular Models"
|
| 561 |
-
)
|
| 562 |
|
|
|
|
| 563 |
gr.Markdown("""
|
| 564 |
---
|
| 565 |
-
###
|
| 566 |
- **Inference mode:** Weights + KV cache + framework overhead
|
| 567 |
-
- **Training
|
| 568 |
- **KV cache:** Scales linearly with context length and batch size
|
| 569 |
- **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
|
| 570 |
- **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
|
| 571 |
|
| 572 |
-
###
|
| 573 |
- Estimates are approximate; actual usage varies by implementation
|
| 574 |
-
- Flash Attention and other optimizations can
|
| 575 |
-
-
|
| 576 |
|
| 577 |
-
Built with
|
| 578 |
""")
|
| 579 |
|
| 580 |
|
|
|
|
| 7 |
- Recommended GPUs and cloud instances
|
| 8 |
- Multi-GPU tensor parallelism estimates
|
| 9 |
- Quantization options with detailed breakdown
|
| 10 |
+
- Model comparison across multiple models
|
| 11 |
+
- Throughput estimation
|
| 12 |
+
- Cloud cost analysis
|
| 13 |
+
- LoRA/QLoRA fine-tuning memory requirements
|
| 14 |
"""
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
+
from huggingface_hub import HfApi, hf_hub_download, list_models
|
| 18 |
import json
|
| 19 |
from functools import lru_cache
|
| 20 |
+
from datetime import datetime
|
| 21 |
|
| 22 |
# Initialize HF API client
|
| 23 |
api = HfApi()
|
| 24 |
|
| 25 |
+
# GPU specs: name -> (VRAM in GB, typical cloud instance, category, hourly_cost, tflops_fp16)
#
# Tuple fields, in order:
#   0. VRAM in GB, compared against the required VRAM to decide whether a model fits.
#   1. Instance / form-factor label, printed in the "Instance" column of the
#      GPU recommendations table.
#   2. Category tag ("consumer", "apple", "workstation", "cloud", "amd").
#   3. Hourly cost (the labels quote it in $/hr). 0 means "no cloud price";
#      calculate_cost_estimate() skips those entries via `hourly_cost > 0`.
#   4. FP16 TFLOPs, passed to estimate_throughput() as `gpu_tflops`.
GPU_SPECS = {
    # Consumer GPUs
    "RTX 3080": (10, "Consumer", "consumer", 0, 29.8),
    "RTX 3090": (24, "Consumer", "consumer", 0, 35.6),
    "RTX 4080": (16, "Consumer", "consumer", 0, 48.7),
    "RTX 4090": (24, "Consumer", "consumer", 0, 82.6),
    "RTX 5090": (32, "Consumer (est.)", "consumer", 0, 105.0),
    # Apple Silicon
    "M2 Ultra": (192, "Mac Studio (Unified)", "apple", 0, 27.2),
    "M3 Max": (128, "MacBook Pro (Unified)", "apple", 0, 14.2),
    "M4 Max": (128, "MacBook Pro (Unified)", "apple", 0, 18.0),
    # Workstation GPUs
    "RTX A6000": (48, "Workstation", "workstation", 0, 38.7),
    "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud", 1.00, 91.6),
    # Cloud GPUs
    "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud", 1.00, 31.2),
    "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud", 0.70, 30.3),
    "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud", 3.00, 77.9),
    "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud", 5.00, 77.9),
    "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud", 8.00, 267.6),
    "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud", 12.00, 296.0),
    # AMD GPUs
    "MI300X": (192, "AMD Cloud Instances", "amd", 6.00, 383.0),
}
|
| 50 |
|
| 51 |
# Bytes per element for different dtypes
|
|
|
|
| 241 |
}
|
| 242 |
|
| 243 |
|
| 244 |
+
def estimate_lora_memory(
    param_count: int,
    dtype_bytes: int,
    lora_rank: int = 16,
    lora_alpha: int = 32,
    target_modules: int = 4,
    use_qlora: bool = False
) -> dict:
    """
    Approximate the memory footprint of LoRA or QLoRA fine-tuning.

    The base model stays frozen; only the low-rank adapter matrices are
    trained, so gradients and optimizer state are sized from the adapter
    parameter count alone. With ``use_qlora`` the frozen base is assumed
    to be stored in 4-bit (0.5 bytes per parameter).

    Args:
        param_count: Number of parameters in the base model.
        dtype_bytes: Bytes per element for the model dtype; used for the
            base weights (non-QLoRA), the adapters and their gradients.
        lora_rank: LoRA rank; scales the adapter size estimate.
        lora_alpha: Accepted for API completeness; not used in the
            estimate.
        target_modules: Number of adapted modules per layer; scales the
            adapter size estimate.
        use_qlora: Size the frozen base weights at 4 bits per parameter.

    Returns:
        Dict with byte counts under ``base_weights``, ``lora_weights``,
        ``gradients``, ``optimizer``, ``activations`` and ``total``, the
        adapter parameter count under ``lora_params``, and a fixed rough
        ``vs_full_finetune_ratio`` (0.3 for QLoRA, 0.5 for LoRA).
    """
    # Frozen base model: 4-bit (0.5 bytes/param) under QLoRA, native dtype otherwise.
    frozen_bytes = param_count * 0.5 if use_qlora else param_count * dtype_bytes

    # Adapter size heuristic: instead of deriving hidden_size * rank per
    # module, scale the base parameter count by a rank/module-dependent
    # fraction (lands at roughly 0.1-1% of the base model).
    adapter_fraction = (lora_rank * 2 * target_modules) / 1000
    adapter_params = int(param_count * adapter_fraction * 0.01)

    adapter_bytes = adapter_params * dtype_bytes
    # Only the adapters receive gradients; the base is frozen.
    grad_bytes = adapter_params * dtype_bytes
    # AdamW keeps two states per trainable parameter at 4 bytes each.
    optim_bytes = adapter_params * 4 * 2
    # Activations are taken as half the base weights (gradient checkpointing assumed).
    act_bytes = frozen_bytes * 0.5

    breakdown = {
        "base_weights": frozen_bytes,
        "lora_weights": adapter_bytes,
        "lora_params": adapter_params,
        "gradients": grad_bytes,
        "optimizer": optim_bytes,
        "activations": act_bytes,
    }
    breakdown["total"] = frozen_bytes + adapter_bytes + grad_bytes + optim_bytes + act_bytes
    # Rough, fixed memory ratio relative to full fine-tuning.
    breakdown["vs_full_finetune_ratio"] = 0.3 if use_qlora else 0.5
    return breakdown
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def estimate_throughput(
    param_count: int,
    gpu_tflops: float,
    batch_size: int = 1,
    context_length: int = 4096,
    is_prefill: bool = False
) -> dict:
    """
    Estimate tokens per second throughput.

    Uses a simple roofline-style model: throughput is the smaller of a
    compute-bound ceiling and a memory-bandwidth-bound ceiling, scaled by
    the batch size and a fixed efficiency factor.

    Args:
        param_count: Number of model parameters. Non-positive values yield
            a zeroed result instead of a division by zero.
        gpu_tflops: GPU FP16 throughput in TFLOPs.
        batch_size: Number of sequences processed together; the estimate
            scales linearly with it.
        context_length: Accepted for interface compatibility; not used by
            the current estimate.
        is_prefill: Treat the workload as prefill, which relaxes the
            memory-bound ceiling by a factor of 10.

    Returns:
        Dict with ``peak_theoretical`` and ``memory_bound`` (tokens/sec,
        per sequence), ``estimated_tokens_per_sec`` (after batch scaling
        and efficiency), plus the ``batch_size`` and ``is_prefill`` inputs.
    """
    # Without a parameter count there is nothing to estimate; return the
    # same shape with zeros rather than raising ZeroDivisionError.
    if param_count <= 0:
        return {
            "peak_theoretical": 0.0,
            "memory_bound": 0.0,
            "estimated_tokens_per_sec": 0.0,
            "batch_size": batch_size,
            "is_prefill": is_prefill,
        }

    # Rough estimate: 2 FLOPs per parameter per token (forward pass)
    flops_per_token = 2 * param_count

    # Peak theoretical throughput (compute-bound)
    peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token

    # Memory-bound estimate (more realistic for decode).
    # Assumes ~1 TB/s memory bandwidth for every GPU (rough average).
    memory_bandwidth_tbs = 1.0
    bytes_per_token = param_count * 2  # FP16 weights need to be read per token
    memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token

    # Prefill is more compute-bound, so its memory ceiling is relaxed 10x;
    # decode is limited by the plain memory-bound ceiling.
    memory_ceiling = memory_bound_tokens * 10 if is_prefill else memory_bound_tokens
    effective_tokens = min(peak_tokens_per_sec, memory_ceiling) * batch_size

    # Apply realistic efficiency factor (typically 30-60% of theoretical)
    efficiency = 0.4
    realistic_tokens = effective_tokens * efficiency

    return {
        "peak_theoretical": peak_tokens_per_sec,
        "memory_bound": memory_bound_tokens,
        "estimated_tokens_per_sec": realistic_tokens,
        "batch_size": batch_size,
        "is_prefill": is_prefill,
    }
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def calculate_cost_estimate(
    vram_required: float,
    hours_per_day: float = 8,
    days_per_month: float = 22
) -> list:
    """Price out every priced GPU in GPU_SPECS that can hold the model.

    A GPU qualifies when its VRAM is at least ``vram_required`` and its
    hourly cost is greater than zero (entries priced at 0 are skipped).

    Args:
        vram_required: VRAM needed, in GB.
        hours_per_day: Usage hours per day for the daily figure.
        days_per_month: Usage days per month for the monthly figure.

    Returns:
        List of dicts with keys ``gpu``, ``vram``, ``hourly``, ``daily``,
        ``monthly`` and ``instance``, ordered from cheapest to most
        expensive hourly rate.
    """
    quotes = []

    for name, spec in GPU_SPECS.items():
        vram, instance, _category, rate, _tflops = spec

        # Skip GPUs that are too small or have no cloud price attached.
        if not (vram >= vram_required and rate > 0):
            continue

        per_day = rate * hours_per_day
        per_month = per_day * days_per_month
        quotes.append({
            "gpu": name,
            "vram": vram,
            "hourly": rate,
            "daily": per_day,
            "monthly": per_month,
            "instance": instance,
        })

    # Stable sort: equally priced GPUs keep their GPU_SPECS order.
    quotes.sort(key=lambda quote: quote["hourly"])
    return quotes
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def search_models(query: str, limit: int = 10) -> list:
    """Look up text-generation models on the HuggingFace Hub by name.

    Args:
        query: Search string; empty or single-character queries are
            rejected without contacting the Hub.
        limit: Maximum number of results requested.

    Returns:
        Model IDs, requested in descending download order, or an empty
        list when the query is too short or the lookup fails for any
        reason.
    """
    # Too little to search on: answer immediately.
    if not query or len(query) < 2:
        return []

    search_kwargs = dict(
        search=query,
        sort="downloads",
        direction=-1,
        limit=limit,
        filter="text-generation",
    )

    # Best-effort lookup: any failure (network, API, malformed entry)
    # degrades to "no results" rather than propagating to the caller.
    try:
        return [hit.id for hit in list_models(**search_kwargs)]
    except Exception:
        return []
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def calculate_flash_attention_savings(
    kv_cache_bytes: int,
    context_length: int
) -> dict:
    """
    Estimate how much peak memory Flash Attention avoids.

    Standard attention is modelled as materializing a full
    ``context_length x context_length`` FP16 score matrix; Flash Attention
    is modelled as needing only ``context_length x 128`` FP16 elements
    (tiled computation). The KV cache is O(n) either way, so the saving
    applies to attention working memory, not to the cache itself.

    Args:
        kv_cache_bytes: Size of the KV cache in bytes. Accepted for
            interface compatibility; not used in the estimate.
        context_length: Sequence length in tokens.

    Returns:
        Dict with ``standard_overhead_gb``, ``flash_overhead_gb``,
        ``savings_gb`` and ``savings_percent``.
    """
    # O(n^2): full attention matrix, 2 bytes per FP16 element.
    naive_bytes = context_length * context_length * 2
    # O(n): one 128-wide block per position, 2 bytes per FP16 element.
    tiled_bytes = context_length * 128 * 2

    saved_bytes = naive_bytes - tiled_bytes
    # max(..., 1) keeps the ratio defined when context_length is 0.
    saved_fraction = 1 - (tiled_bytes / max(naive_bytes, 1))

    summary = {}
    summary["standard_overhead_gb"] = bytes_to_gb(naive_bytes)
    summary["flash_overhead_gb"] = bytes_to_gb(tiled_bytes)
    summary["savings_gb"] = bytes_to_gb(saved_bytes)
    summary["savings_percent"] = saved_fraction * 100
    return summary
|
| 411 |
+
|
| 412 |
+
|
| 413 |
def calculate_vram(
|
| 414 |
model_id: str,
|
| 415 |
context_length: int = 4096,
|
|
|
|
| 418 |
optimizer: str = "AdamW",
|
| 419 |
serving_framework: str = "None (raw PyTorch)",
|
| 420 |
num_gpus: int = 1,
|
| 421 |
+
parallelism: str = "Tensor Parallelism",
|
| 422 |
+
use_flash_attention: bool = True,
|
| 423 |
+
lora_rank: int = 16,
|
| 424 |
+
show_throughput: bool = True,
|
| 425 |
+
show_cost: bool = True
|
| 426 |
) -> tuple[str, dict | None]:
|
| 427 |
"""Main calculation function. Returns (markdown_results, chart_data)."""
|
| 428 |
|
|
|
|
| 509 |
results.append("Could not find architecture details")
|
| 510 |
kv_gb = 0
|
| 511 |
|
| 512 |
+
# Flash Attention savings
|
| 513 |
+
flash_savings = None
|
| 514 |
+
if use_flash_attention and kv_gb > 0:
|
| 515 |
+
kv_bytes = estimate_kv_cache_size(
|
| 516 |
+
num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
|
| 517 |
+
)
|
| 518 |
+
flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
|
| 519 |
+
|
| 520 |
# Calculate total based on mode
|
| 521 |
+
if mode == "Training (Full)":
|
| 522 |
training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
|
| 523 |
base_gb = bytes_to_gb(training_mem["total_base"])
|
| 524 |
|
| 525 |
# Activations estimation (rough: ~2x weights for typical batch)
|
| 526 |
activation_gb = weights_gb * 2 * batch_size
|
| 527 |
+
if use_flash_attention and flash_savings:
|
| 528 |
+
activation_gb -= flash_savings["savings_gb"]
|
| 529 |
+
activation_gb = max(0.1, activation_gb)
|
| 530 |
+
|
| 531 |
total_gb = base_gb + kv_gb + activation_gb
|
| 532 |
|
| 533 |
results.append(f"\n### 🎓 Training Memory Breakdown")
|
|
|
|
| 544 |
"KV Cache": kv_gb,
|
| 545 |
"Activations": activation_gb,
|
| 546 |
}
|
| 547 |
+
|
| 548 |
+
elif mode == "LoRA Fine-tuning":
|
| 549 |
+
lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
|
| 550 |
+
total_gb = bytes_to_gb(lora_mem["total"])
|
| 551 |
+
|
| 552 |
+
results.append(f"\n### 🔧 LoRA Fine-tuning (rank={lora_rank})")
|
| 553 |
+
results.append(f"- **Base weights (frozen):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
|
| 554 |
+
results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
|
| 555 |
+
results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
|
| 556 |
+
results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
|
| 557 |
+
results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
|
| 558 |
+
results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
|
| 559 |
+
|
| 560 |
+
chart_data = {
|
| 561 |
+
"Base Weights": bytes_to_gb(lora_mem['base_weights']),
|
| 562 |
+
"LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
|
| 563 |
+
"Gradients": bytes_to_gb(lora_mem['gradients']),
|
| 564 |
+
"Optimizer": bytes_to_gb(lora_mem['optimizer']),
|
| 565 |
+
"Activations": bytes_to_gb(lora_mem['activations']),
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
elif mode == "QLoRA Fine-tuning":
|
| 569 |
+
lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
|
| 570 |
+
total_gb = bytes_to_gb(lora_mem["total"])
|
| 571 |
+
|
| 572 |
+
results.append(f"\n### 🔧 QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
|
| 573 |
+
results.append(f"- **Base weights (4-bit):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
|
| 574 |
+
results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
|
| 575 |
+
results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
|
| 576 |
+
results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
|
| 577 |
+
results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
|
| 578 |
+
results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
|
| 579 |
+
|
| 580 |
+
chart_data = {
|
| 581 |
+
"Base (4-bit)": bytes_to_gb(lora_mem['base_weights']),
|
| 582 |
+
"LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
|
| 583 |
+
"Gradients": bytes_to_gb(lora_mem['gradients']),
|
| 584 |
+
"Optimizer": bytes_to_gb(lora_mem['optimizer']),
|
| 585 |
+
"Activations": bytes_to_gb(lora_mem['activations']),
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
else:
|
| 589 |
# Inference mode
|
| 590 |
framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
|
| 591 |
base_total = weights_gb + kv_gb
|
| 592 |
overhead_gb = base_total * (framework_overhead - 1)
|
| 593 |
+
|
| 594 |
+
# Flash Attention reduces activation memory overhead during inference
|
| 595 |
+
if use_flash_attention and flash_savings:
|
| 596 |
+
overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
|
| 597 |
+
overhead_gb = max(0, overhead_gb)
|
| 598 |
+
|
| 599 |
total_gb = base_total + overhead_gb
|
| 600 |
|
| 601 |
results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
|
|
|
|
| 609 |
"Overhead": overhead_gb,
|
| 610 |
}
|
| 611 |
|
| 612 |
+
# Flash Attention info
|
| 613 |
+
if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
|
| 614 |
+
results.append(f"\n### ⚡ Flash Attention")
|
| 615 |
+
results.append(f"- **Enabled:** Yes")
|
| 616 |
+
results.append(f"- **Peak memory savings:** ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
|
| 617 |
+
|
| 618 |
results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
|
| 619 |
|
| 620 |
# Multi-GPU calculations
|
|
|
|
| 632 |
|
| 633 |
# GPU Recommendations
|
| 634 |
results.append(f"\n### 🎮 GPU Recommendations")
|
| 635 |
+
results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
|
| 636 |
+
results.append("|-----|------|-------|----------|------------|----------|")
|
| 637 |
|
| 638 |
+
for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
|
| 639 |
fits = "✅" if vram >= effective_vram_needed else "❌"
|
| 640 |
headroom = vram - effective_vram_needed
|
| 641 |
headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
|
| 642 |
+
|
| 643 |
+
# Estimate throughput for this GPU
|
| 644 |
+
if show_throughput and vram >= effective_vram_needed:
|
| 645 |
+
throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
|
| 646 |
+
tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
|
| 647 |
+
else:
|
| 648 |
+
tok_str = "-"
|
| 649 |
+
|
| 650 |
+
results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
|
| 651 |
|
| 652 |
# Quantization options (if model doesn't fit on consumer GPUs)
|
| 653 |
if effective_vram_needed > 24:
|
|
|
|
| 664 |
|
| 665 |
results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
|
| 666 |
|
| 667 |
+
# Cost estimates for cloud GPUs
|
| 668 |
+
if show_cost:
|
| 669 |
+
cost_estimates = calculate_cost_estimate(effective_vram_needed)
|
| 670 |
+
if cost_estimates:
|
| 671 |
+
results.append(f"\n### 💰 Cloud Cost Estimates")
|
| 672 |
+
results.append("*Based on 8 hrs/day, 22 days/month*\n")
|
| 673 |
+
results.append("| GPU | Hourly | Daily | Monthly |")
|
| 674 |
+
results.append("|-----|--------|-------|---------|")
|
| 675 |
+
for est in cost_estimates[:5]: # Top 5 cheapest
|
| 676 |
+
results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
|
| 677 |
+
|
| 678 |
return "\n".join(results), chart_data
|
| 679 |
|
| 680 |
|
|
|
|
| 696 |
)
|
| 697 |
|
| 698 |
|
| 699 |
+
def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
    """Compare multiple models side by side.

    Args:
        model_ids_text: Model IDs separated by newlines. Blank lines and
            surrounding whitespace are ignored. Between 2 and 5 IDs are
            accepted.
        context_length: Sequence length passed to the KV-cache estimate.

    Returns:
        A Markdown report containing a comparison table (inference, full
        training and QLoRA memory per model), an "Errors" list naming the
        reason for every model that could not be analysed, and - when at
        least two models succeeded - the lowest-memory model per category.
        If the number of IDs is out of range, a short plain-text message is
        returned instead.
    """
    model_ids = [m.strip() for m in (model_ids_text or "").split("\n") if m.strip()]

    if len(model_ids) < 2:
        return "Please enter at least 2 model IDs (one per line)"

    if len(model_ids) > 5:
        return "Maximum 5 models for comparison"

    results = ["## Model Comparison\n"]
    comparison_data = []

    for model_id in model_ids:
        # One failing model must not abort the whole comparison, so every
        # per-model failure is recorded as an error row instead of raised.
        try:
            info = get_model_info(model_id)
            config = get_config(model_id)
            param_count, dominant_dtype = estimate_params_from_safetensors(info)

            if param_count == 0:
                raise ValueError("Could not determine parameters")

            # Unknown dtypes fall back to 2 bytes per parameter.
            dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
            weights_gb = bytes_to_gb(param_count * dtype_bytes)

            num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
            num_kv_heads = config.get("num_key_value_heads",
                                      config.get("num_attention_heads", 0))
            head_dim = get_head_dim(config)

            # The literal 1 is presumably the batch size - TODO confirm
            # against estimate_kv_cache_size's signature.
            kv_bytes = estimate_kv_cache_size(
                num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes
            )
            kv_gb = bytes_to_gb(kv_bytes)
            total_inference = weights_gb + kv_gb

            # Training estimate.
            # NOTE(review): the extra 2x weights term looks like an allowance
            # on top of the base training memory - confirm what it models.
            training_mem = estimate_training_memory(param_count, dtype_bytes)
            training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2

            # QLoRA estimate at a fixed rank of 16.
            qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
            qlora_gb = bytes_to_gb(qlora_mem["total"])

            comparison_data.append({
                "model": model_id.split("/")[-1],
                "full_id": model_id,
                "params": f"{param_count/1e9:.1f}B",
                "dtype": dominant_dtype,
                "weights_gb": weights_gb,
                "kv_gb": kv_gb,
                "inference_gb": total_inference,
                "training_gb": training_gb,
                "qlora_gb": qlora_gb,
            })
        except Exception as e:
            comparison_data.append({
                "model": model_id,
                "error": str(e)
            })

    # Build comparison table
    results.append(f"*Context length: {context_length:,}*\n")
    results.append("| Model | Params | Inference | Training | QLoRA |")
    results.append("|-------|--------|-----------|----------|-------|")

    for data in comparison_data:
        if "error" in data:
            results.append(f"| {data['model']} | Error | - | - | - |")
        else:
            results.append(
                f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
                f"{data['params']} | "
                f"{data['inference_gb']:.1f} GB | "
                f"{data['training_gb']:.1f} GB | "
                f"{data['qlora_gb']:.1f} GB |"
            )

    # Explain every "Error" row; the reason is kept out of the table so long
    # or multi-line exception messages cannot break the table layout.
    failed = [d for d in comparison_data if "error" in d]
    if failed:
        results.append("\n### Errors")
        for data in failed:
            reason = " ".join(data["error"].split()) or "Unknown error"
            results.append(f"- **{data['model']}:** {reason}")

    # Find minimum for each category
    valid_data = [d for d in comparison_data if "error" not in d]
    if len(valid_data) >= 2:
        results.append("\n### Recommendations")

        min_inference = min(valid_data, key=lambda x: x["inference_gb"])
        min_training = min(valid_data, key=lambda x: x["training_gb"])
        min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])

        results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
        results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
        results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")

    return "\n".join(results)
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
def export_results(result_text: str, format_type: str) -> str:
    """Export results to different formats.

    Args:
        result_text: Markdown report text to export.
        format_type: ``"JSON"`` for structured output; any other value
            produces plain text.

    Returns:
        For JSON: a document with ``timestamp``, ``raw_markdown`` and
        ``sections``, where each ``### `` heading maps to the list of
        non-blank lines beneath it and lines before the first heading are
        filed under ``"header"``.
        For plain text: the report with ``**`` removed and ``##``/``###``
        markers turned into ``===`` rules, prefixed with a timestamped banner.
        If there is nothing to export, a short hint message is returned.
    """
    # Whitespace-only input carries no results either.
    if not result_text or not result_text.strip():
        return "No results to export. Run a calculation first."

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if format_type == "JSON":
        # Parse markdown to create structured JSON
        sections = {}
        current_section = "header"
        for line in result_text.split("\n"):
            if line.startswith("### "):
                # Strip only the leading marker, and reuse an existing list so
                # a repeated heading appends rather than discarding content.
                current_section = line[len("### "):].strip()
                sections.setdefault(current_section, [])
            elif line.strip():
                sections.setdefault(current_section, []).append(line.strip())

        data = {
            "timestamp": timestamp,
            "raw_markdown": result_text,
            "sections": sections,
        }
        # ensure_ascii=False keeps emoji in headings readable instead of
        # emitting \uXXXX escape pairs; the parsed value is unchanged.
        return json.dumps(data, indent=2, ensure_ascii=False)

    # Plain text: convert markdown to plain text. "###" must be replaced
    # before "##" so the longer marker is not split.
    plain = result_text.replace("**", "")
    plain = plain.replace("###", "\n===")
    plain = plain.replace("##", "\n===")
    return f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
|
| 835 |
+
|
| 836 |
+
|
| 837 |
# Build Gradio interface
|
| 838 |
with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
|
| 839 |
gr.Markdown("""
|
| 840 |
+
# VRAM & Instance Type Calculator
|
| 841 |
|
| 842 |
+
Estimate GPU memory requirements for HuggingFace models. Supports inference, training, LoRA/QLoRA fine-tuning,
|
| 843 |
+
multi-GPU setups, model comparison, and detailed quantization recommendations.
|
| 844 |
""")
|
| 845 |
|
| 846 |
+
with gr.Tabs():
|
| 847 |
+
# === CALCULATOR TAB ===
|
| 848 |
+
with gr.TabItem("Calculator"):
|
| 849 |
+
with gr.Row():
|
| 850 |
+
with gr.Column(scale=2):
|
| 851 |
+
model_input = gr.Textbox(
|
| 852 |
+
label="Model ID",
|
| 853 |
+
placeholder="meta-llama/Llama-3.1-8B",
|
| 854 |
+
info="Full HuggingFace model ID (org/model-name)"
|
| 855 |
+
)
|
| 856 |
+
with gr.Column(scale=1):
|
| 857 |
+
search_input = gr.Textbox(
|
| 858 |
+
label="Search Models",
|
| 859 |
+
placeholder="llama 8b",
|
| 860 |
+
info="Search HuggingFace for models"
|
| 861 |
+
)
|
| 862 |
+
search_btn = gr.Button("Search", size="sm")
|
| 863 |
+
|
| 864 |
+
with gr.Row(visible=False) as search_results_row:
|
| 865 |
+
search_results = gr.Dropdown(
|
| 866 |
+
label="Search Results (click to select)",
|
| 867 |
+
choices=[],
|
| 868 |
+
interactive=True,
|
| 869 |
+
)
|
| 870 |
+
|
| 871 |
+
def do_search(query):
    """Run a model search and refresh the results row and dropdown.

    Returns a pair of Gradio updates, in the order the click handler wires
    them: first the visibility of the results row, then the dropdown's
    choices/value. An empty query hides the row and clears the choices.
    """
    if query:
        matches = search_models(query, limit=10)
        if matches:
            # Pre-select the first hit.
            dropdown_update = gr.update(choices=matches, value=matches[0])
        else:
            # Placeholder entry; select_model treats this string as "nothing chosen".
            dropdown_update = gr.update(choices=["No models found"], value=None)
        return gr.update(visible=True), dropdown_update

    return gr.update(visible=False), gr.update(choices=[])
|
| 878 |
+
|
| 879 |
+
def select_model(selected):
    """Map a dropdown selection to the text for the Model ID box.

    Returns the selection unchanged, or an empty string when nothing is
    selected or the selection is the "No models found" placeholder.
    """
    nothing_chosen = not selected or selected == "No models found"
    return "" if nothing_chosen else selected
|
| 883 |
+
|
| 884 |
+
search_btn.click(
|
| 885 |
+
fn=do_search,
|
| 886 |
+
inputs=[search_input],
|
| 887 |
+
outputs=[search_results_row, search_results]
|
| 888 |
+
)
|
| 889 |
+
search_results.change(
|
| 890 |
+
fn=select_model,
|
| 891 |
+
inputs=[search_results],
|
| 892 |
+
outputs=[model_input]
|
| 893 |
+
)
|
| 894 |
+
|
| 895 |
+
with gr.Row():
|
| 896 |
+
with gr.Column(scale=1):
|
| 897 |
+
mode_input = gr.Radio(
|
| 898 |
+
choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
|
| 899 |
+
value="Inference",
|
| 900 |
+
label="Mode",
|
| 901 |
+
info="LoRA/QLoRA use significantly less memory"
|
| 902 |
+
)
|
| 903 |
+
with gr.Column(scale=1):
|
| 904 |
+
context_input = gr.Slider(
|
| 905 |
+
label="Context Length",
|
| 906 |
+
minimum=512,
|
| 907 |
+
maximum=131072,
|
| 908 |
+
value=4096,
|
| 909 |
+
step=512,
|
| 910 |
+
info="Sequence length for KV cache"
|
| 911 |
+
)
|
| 912 |
+
with gr.Column(scale=1):
|
| 913 |
+
batch_input = gr.Slider(
|
| 914 |
+
label="Batch Size",
|
| 915 |
+
minimum=1,
|
| 916 |
+
maximum=64,
|
| 917 |
+
value=1,
|
| 918 |
+
step=1,
|
| 919 |
+
info="Concurrent sequences"
|
| 920 |
+
)
|
| 921 |
+
|
| 922 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 923 |
+
with gr.Row():
|
| 924 |
+
with gr.Column():
|
| 925 |
+
serving_input = gr.Dropdown(
|
| 926 |
+
choices=list(SERVING_FRAMEWORKS.keys()),
|
| 927 |
+
value="None (raw PyTorch)",
|
| 928 |
+
label="Serving Framework",
|
| 929 |
+
info="Different frameworks have different overhead"
|
| 930 |
+
)
|
| 931 |
+
optimizer_input = gr.Dropdown(
|
| 932 |
+
choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
|
| 933 |
+
value="AdamW",
|
| 934 |
+
label="Optimizer (Training mode)",
|
| 935 |
+
info="Optimizer state memory varies"
|
| 936 |
+
)
|
| 937 |
+
lora_rank_input = gr.Slider(
|
| 938 |
+
label="LoRA Rank",
|
| 939 |
+
minimum=4,
|
| 940 |
+
maximum=128,
|
| 941 |
+
value=16,
|
| 942 |
+
step=4,
|
| 943 |
+
info="Higher rank = more capacity but more memory"
|
| 944 |
+
)
|
| 945 |
+
with gr.Column():
|
| 946 |
+
num_gpus_input = gr.Slider(
|
| 947 |
+
label="Number of GPUs",
|
| 948 |
+
minimum=1,
|
| 949 |
+
maximum=8,
|
| 950 |
+
value=1,
|
| 951 |
+
step=1,
|
| 952 |
+
info="For multi-GPU setups"
|
| 953 |
+
)
|
| 954 |
+
parallelism_input = gr.Dropdown(
|
| 955 |
+
choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
|
| 956 |
+
value="Tensor Parallelism",
|
| 957 |
+
label="Parallelism Strategy",
|
| 958 |
+
info="How to distribute across GPUs"
|
| 959 |
+
)
|
| 960 |
+
flash_attention_input = gr.Checkbox(
|
| 961 |
+
label="Use Flash Attention",
|
| 962 |
+
value=True,
|
| 963 |
+
info="Reduces peak memory usage"
|
| 964 |
+
)
|
| 965 |
+
with gr.Row():
|
| 966 |
+
show_throughput_input = gr.Checkbox(
|
| 967 |
+
label="Show Throughput Estimates",
|
| 968 |
+
value=True,
|
| 969 |
+
info="Estimated tokens/sec per GPU"
|
| 970 |
+
)
|
| 971 |
+
show_cost_input = gr.Checkbox(
|
| 972 |
+
label="Show Cost Estimates",
|
| 973 |
+
value=True,
|
| 974 |
+
info="Cloud GPU hourly/monthly costs"
|
| 975 |
+
)
|
| 976 |
+
|
| 977 |
+
calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
|
| 978 |
+
|
| 979 |
+
with gr.Row():
|
| 980 |
+
with gr.Column(scale=3):
|
| 981 |
+
output = gr.Markdown(label="Results")
|
| 982 |
+
with gr.Column(scale=1):
|
| 983 |
+
chart_output = gr.BarPlot(
|
| 984 |
+
x="Component",
|
| 985 |
+
y="GB",
|
| 986 |
+
title="Memory Breakdown",
|
| 987 |
+
height=350,
|
| 988 |
+
)
|
| 989 |
+
|
| 990 |
+
def run_calculation(
    model_id, context_length, batch_size, mode, optimizer, serving,
    num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
):
    """Run calculate_vram and adapt its chart data for the bar plot.

    All arguments are forwarded unchanged to calculate_vram. Returns the
    report text together with a two-column DataFrame ("Component", "GB")
    built from the chart data, or None in place of the DataFrame when no
    chart data came back.
    """
    report, breakdown = calculate_vram(
        model_id, context_length, batch_size, mode, optimizer, serving,
        num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
    )
    if not breakdown:
        return report, None

    # Imported lazily, so pandas is only needed when there is a chart to draw.
    import pandas as pd

    frame = pd.DataFrame({
        "Component": [*breakdown.keys()],
        "GB": [*breakdown.values()],
    })
    return report, frame
|
| 1006 |
+
|
| 1007 |
+
calculate_btn.click(
|
| 1008 |
+
fn=run_calculation,
|
| 1009 |
+
inputs=[
|
| 1010 |
+
model_input, context_input, batch_input, mode_input,
|
| 1011 |
+
optimizer_input, serving_input, num_gpus_input, parallelism_input,
|
| 1012 |
+
flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
|
| 1013 |
+
],
|
| 1014 |
+
outputs=[output, chart_output]
|
| 1015 |
)
|
| 1016 |
|
| 1017 |
+
# Examples
|
| 1018 |
+
gr.Examples(
|
| 1019 |
+
examples=[
|
| 1020 |
+
["meta-llama/Llama-3.1-8B", 4096, 1],
|
| 1021 |
+
["meta-llama/Llama-3.1-70B", 8192, 1],
|
| 1022 |
+
["mistralai/Mistral-7B-v0.1", 8192, 1],
|
| 1023 |
+
["Qwen/Qwen2.5-72B", 32768, 1],
|
| 1024 |
+
["google/gemma-2-27b", 8192, 1],
|
| 1025 |
+
["microsoft/phi-4", 16384, 1],
|
| 1026 |
+
["deepseek-ai/DeepSeek-V3", 4096, 1],
|
| 1027 |
+
["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
|
| 1028 |
+
],
|
| 1029 |
+
inputs=[model_input, context_input, batch_input],
|
| 1030 |
+
label="Popular Models"
|
| 1031 |
)
|
| 1032 |
+
|
| 1033 |
+
# === COMPARE TAB ===
|
| 1034 |
+
with gr.TabItem("Compare Models"):
|
| 1035 |
+
gr.Markdown("""
|
| 1036 |
+
Compare VRAM requirements across multiple models side-by-side.
|
| 1037 |
+
Enter model IDs one per line (2-5 models).
|
| 1038 |
+
""")
|
| 1039 |
+
|
| 1040 |
+
compare_models_input = gr.Textbox(
|
| 1041 |
+
label="Model IDs (one per line)",
|
| 1042 |
+
placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
|
| 1043 |
+
lines=5,
|
| 1044 |
+
)
|
| 1045 |
+
compare_context_input = gr.Slider(
|
| 1046 |
label="Context Length",
|
| 1047 |
minimum=512,
|
| 1048 |
maximum=131072,
|
| 1049 |
value=4096,
|
| 1050 |
step=512,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1051 |
)
|
| 1052 |
+
compare_btn = gr.Button("Compare Models", variant="primary")
|
| 1053 |
+
compare_output = gr.Markdown(label="Comparison Results")
|
| 1054 |
|
| 1055 |
+
compare_btn.click(
|
| 1056 |
+
fn=compare_models,
|
| 1057 |
+
inputs=[compare_models_input, compare_context_input],
|
| 1058 |
+
outputs=compare_output
|
| 1059 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1060 |
|
| 1061 |
+
gr.Examples(
|
| 1062 |
+
examples=[
|
| 1063 |
+
["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
|
| 1064 |
+
["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B\nmeta-llama/Llama-3.3-70B-Instruct", 8192],
|
| 1065 |
+
],
|
| 1066 |
+
inputs=[compare_models_input, compare_context_input],
|
| 1067 |
+
label="Example Comparisons"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1068 |
)
|
| 1069 |
|
| 1070 |
+
# === EXPORT TAB ===
|
| 1071 |
+
with gr.TabItem("Export"):
|
| 1072 |
+
gr.Markdown("""
|
| 1073 |
+
Export your calculation results to JSON or plain text format.
|
| 1074 |
+
First run a calculation in the Calculator tab, then copy the results here.
|
| 1075 |
+
""")
|
| 1076 |
+
|
| 1077 |
+
export_input = gr.Textbox(
|
| 1078 |
+
label="Paste Results Here",
|
| 1079 |
+
placeholder="Paste the calculation results from the Calculator tab...",
|
| 1080 |
+
lines=10,
|
| 1081 |
+
)
|
| 1082 |
+
export_format = gr.Radio(
|
| 1083 |
+
choices=["JSON", "Plain Text"],
|
| 1084 |
+
value="JSON",
|
| 1085 |
+
label="Export Format"
|
| 1086 |
+
)
|
| 1087 |
+
export_btn = gr.Button("Export", variant="primary")
|
| 1088 |
+
export_output = gr.Textbox(
|
| 1089 |
+
label="Exported Data",
|
| 1090 |
+
lines=15,
|
| 1091 |
+
show_copy_button=True,
|
| 1092 |
+
)
|
| 1093 |
|
| 1094 |
+
export_btn.click(
|
| 1095 |
+
fn=export_results,
|
| 1096 |
+
inputs=[export_input, export_format],
|
| 1097 |
+
outputs=export_output
|
| 1098 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1099 |
|
| 1100 |
+
# Notes outside tabs
|
| 1101 |
gr.Markdown("""
|
| 1102 |
---
|
| 1103 |
+
### Notes
|
| 1104 |
- **Inference mode:** Weights + KV cache + framework overhead
|
| 1105 |
+
- **Training modes:** Full training, LoRA, and QLoRA with different memory profiles
|
| 1106 |
- **KV cache:** Scales linearly with context length and batch size
|
| 1107 |
- **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
|
| 1108 |
- **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
|
| 1109 |
|
| 1110 |
+
### Disclaimers
|
| 1111 |
- Estimates are approximate; actual usage varies by implementation
|
| 1112 |
+
- Flash Attention and other optimizations can reduce peak memory
|
| 1113 |
+
- Throughput estimates assume ideal conditions
|
| 1114 |
|
| 1115 |
+
Built with Gradio & HuggingFace Hub API
|
| 1116 |
""")
|
| 1117 |
|
| 1118 |
|