Spaces:

Livengood
/

Instance-VRAM-Calculator

Running

Livengood Claude committed 12 days ago

Commit

26bc78c

1 Parent(s): 50bc6be

Add LoRA/QLoRA modes, model comparison, search, throughput, cost estimates, and export

New features:
- LoRA and QLoRA fine-tuning memory estimation modes
- Model comparison tab for side-by-side VRAM analysis
- Model search with HuggingFace API integration
- Throughput estimation (tokens/sec) per GPU
- Cloud cost estimates (hourly/daily/monthly)
- Flash Attention toggle with memory savings display
- Export results to JSON or plain text
- Tabbed interface (Calculator, Compare, Export)

Updated GPU specs with TFLOPs and hourly costs for cloud instances.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

.gitignore +2 -0
app.py +674 -136

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/*
2	+ .claude/*

app.py CHANGED Viewed

@@ -7,40 +7,45 @@ Fetches model metadata from HF Hub and calculates:
 - Recommended GPUs and cloud instances
 - Multi-GPU tensor parallelism estimates
 - Quantization options with detailed breakdown
 """
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download
 import json
 from functools import lru_cache
 # Initialize HF API client
 api = HfApi()
-# GPU specs: name -> (VRAM in GB, typical cloud instance, category)
 GPU_SPECS = {
     # Consumer GPUs
-    "RTX 3080": (10, "Consumer", "consumer"),
-    "RTX 3090": (24, "Consumer", "consumer"),
-    "RTX 4080": (16, "Consumer", "consumer"),
-    "RTX 4090": (24, "Consumer", "consumer"),
-    "RTX 5090": (32, "Consumer (est.)", "consumer"),
     # Apple Silicon
-    "M2 Ultra": (192, "Mac Studio (Unified)", "apple"),
-    "M3 Max": (128, "MacBook Pro (Unified)", "apple"),
-    "M4 Max": (128, "MacBook Pro (Unified)", "apple"),
     # Workstation GPUs
-    "RTX A6000": (48, "Workstation", "workstation"),
-    "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud"),
     # Cloud GPUs
-    "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud"),
-    "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud"),
-    "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud"),
-    "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud"),
-    "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud"),
-    "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud"),
     # AMD GPUs
-    "MI300X": (192, "AMD Cloud Instances", "amd"),
 }
 # Bytes per element for different dtypes
@@ -236,6 +241,175 @@ def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism:
         }
 def calculate_vram(
     model_id: str,
     context_length: int = 4096,
@@ -244,7 +418,11 @@ def calculate_vram(
     optimizer: str = "AdamW",
     serving_framework: str = "None (raw PyTorch)",
     num_gpus: int = 1,
-    parallelism: str = "Tensor Parallelism"
 ) -> tuple[str, dict | None]:
     """Main calculation function. Returns (markdown_results, chart_data)."""
@@ -331,13 +509,25 @@ def calculate_vram(
         results.append("Could not find architecture details")
         kv_gb = 0
     # Calculate total based on mode
-    if mode == "Training":
         training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
         base_gb = bytes_to_gb(training_mem["total_base"])
         # Activations estimation (rough: ~2x weights for typical batch)
         activation_gb = weights_gb * 2 * batch_size
         total_gb = base_gb + kv_gb + activation_gb
         results.append(f"\n### 🎓 Training Memory Breakdown")
@@ -354,11 +544,58 @@ def calculate_vram(
             "KV Cache": kv_gb,
             "Activations": activation_gb,
         }
     else:
         # Inference mode
         framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
         base_total = weights_gb + kv_gb
         overhead_gb = base_total * (framework_overhead - 1)
         total_gb = base_total + overhead_gb
         results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
@@ -372,6 +609,12 @@ def calculate_vram(
             "Overhead": overhead_gb,
         }
     results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
     # Multi-GPU calculations
@@ -389,14 +632,22 @@ def calculate_vram(
     # GPU Recommendations
     results.append(f"\n### 🎮 GPU Recommendations")
-    results.append("| GPU | VRAM | Fits? | Headroom | Instance |")
-    results.append("|-----|------|-------|----------|----------|")
-    for gpu_name, (vram, instance, category) in GPU_SPECS.items():
         fits = "✅" if vram >= effective_vram_needed else "❌"
         headroom = vram - effective_vram_needed
         headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
-        results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {instance} |")
     # Quantization options (if model doesn't fit on consumer GPUs)
     if effective_vram_needed > 24:
@@ -413,6 +664,17 @@ def calculate_vram(
         results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
     return "\n".join(results), chart_data
@@ -434,147 +696,423 @@ def create_memory_chart(chart_data: dict | None):
     )
 # Build Gradio interface
 with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🧮 VRAM & Instance Type Calculator
-    Estimate GPU memory requirements for HuggingFace models. Supports inference and training modes,
-    multi-GPU setups, and provides detailed quantization recommendations.
     """)
-    with gr.Row():
-        with gr.Column(scale=2):
-            model_input = gr.Textbox(
-                label="Model ID",
-                placeholder="meta-llama/Llama-3.1-8B",
-                info="Full HuggingFace model ID (org/model-name)"
             )
-    with gr.Row():
-        with gr.Column(scale=1):
-            mode_input = gr.Radio(
-                choices=["Inference", "Training"],
-                value="Inference",
-                label="Mode",
-                info="Training requires ~4x more memory"
             )
-        with gr.Column(scale=1):
-            context_input = gr.Slider(
                 label="Context Length",
                 minimum=512,
                 maximum=131072,
                 value=4096,
                 step=512,
-                info="Sequence length for KV cache"
-            )
-        with gr.Column(scale=1):
-            batch_input = gr.Slider(
-                label="Batch Size",
-                minimum=1,
-                maximum=64,
-                value=1,
-                step=1,
-                info="Concurrent sequences"
             )
-    with gr.Accordion("⚙️ Advanced Options", open=False):
-        with gr.Row():
-            with gr.Column():
-                serving_input = gr.Dropdown(
-                    choices=list(SERVING_FRAMEWORKS.keys()),
-                    value="None (raw PyTorch)",
-                    label="Serving Framework",
-                    info="Different frameworks have different overhead"
-                )
-                optimizer_input = gr.Dropdown(
-                    choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
-                    value="AdamW",
-                    label="Optimizer (Training mode)",
-                    info="Optimizer state memory varies"
-                )
-            with gr.Column():
-                num_gpus_input = gr.Slider(
-                    label="Number of GPUs",
-                    minimum=1,
-                    maximum=8,
-                    value=1,
-                    step=1,
-                    info="For multi-GPU setups"
-                )
-                parallelism_input = gr.Dropdown(
-                    choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
-                    value="Tensor Parallelism",
-                    label="Parallelism Strategy",
-                    info="How to distribute across GPUs"
-                )
-    calculate_btn = gr.Button("🚀 Calculate VRAM", variant="primary", size="lg")
-    with gr.Row():
-        with gr.Column(scale=3):
-            output = gr.Markdown(label="Results")
-        with gr.Column(scale=1):
-            chart_output = gr.BarPlot(
-                x="Component",
-                y="GB",
-                title="Memory Breakdown",
-                height=350,
             )
-    def run_calculation(model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism):
-        result_text, chart_data = calculate_vram(
-            model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism
-        )
-        if chart_data:
-            import pandas as pd
-            df = pd.DataFrame({
-                "Component": list(chart_data.keys()),
-                "GB": list(chart_data.values())
-            })
-            return result_text, df
-        return result_text, None
-    calculate_btn.click(
-        fn=run_calculation,
-        inputs=[
-            model_input, context_input, batch_input, mode_input,
-            optimizer_input, serving_input, num_gpus_input, parallelism_input
-        ],
-        outputs=[output, chart_output]
-    )
-    # Examples
-    gr.Examples(
-        examples=[
-            ["meta-llama/Llama-3.1-8B", 4096, 1],
-            ["meta-llama/Llama-3.1-70B", 8192, 1],
-            ["mistralai/Mistral-7B-v0.1", 8192, 1],
-            ["Qwen/Qwen2.5-72B", 32768, 1],
-            ["google/gemma-2-27b", 8192, 1],
-            ["microsoft/phi-4", 16384, 1],
-            ["deepseek-ai/DeepSeek-V3", 4096, 1],
-            ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
-        ],
-        inputs=[model_input, context_input, batch_input],
-        label="🔥 Popular Models"
-    )
     gr.Markdown("""
     ---
-    ### 📝 Notes
     - **Inference mode:** Weights + KV cache + framework overhead
-    - **Training mode:** Adds gradients, optimizer states, and activation memory
     - **KV cache:** Scales linearly with context length and batch size
     - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
     - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
-    ### ⚠️ Disclaimers
     - Estimates are approximate; actual usage varies by implementation
-    - Flash Attention and other optimizations can significantly reduce memory
-    - GGUF models have different memory profiles than safetensors
-    Built with 💜 using Gradio & HuggingFace Hub API
     """)

 - Recommended GPUs and cloud instances
 - Multi-GPU tensor parallelism estimates
 - Quantization options with detailed breakdown
+- Model comparison across multiple models
+- Throughput estimation
+- Cloud cost analysis
+- LoRA/QLoRA fine-tuning memory requirements
 """
 import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download, list_models
 import json
 from functools import lru_cache
+from datetime import datetime
 # Initialize HF API client
 api = HfApi()
+# GPU specs: name -> (VRAM in GB, typical cloud instance, category, hourly_cost, tflops_fp16)
+# - hourly_cost is in dollars per hour; 0 marks an entry with no cloud price, and
+#   calculate_cost_estimate() leaves it out because it only keeps hourly_cost > 0.
+# - tflops_fp16 is the compute figure handed to estimate_throughput() when the
+#   GPU recommendation table is built.
+# NOTE(review): "L40S" sits under the workstation heading but is categorized "cloud",
+# and its instance string names AWS g6.xlarge -- confirm the intended instance family.
 GPU_SPECS = {
     # Consumer GPUs
+    "RTX 3080": (10, "Consumer", "consumer", 0, 29.8),
+    "RTX 3090": (24, "Consumer", "consumer", 0, 35.6),
+    "RTX 4080": (16, "Consumer", "consumer", 0, 48.7),
+    "RTX 4090": (24, "Consumer", "consumer", 0, 82.6),
+    "RTX 5090": (32, "Consumer (est.)", "consumer", 0, 105.0),
     # Apple Silicon
+    "M2 Ultra": (192, "Mac Studio (Unified)", "apple", 0, 27.2),
+    "M3 Max": (128, "MacBook Pro (Unified)", "apple", 0, 14.2),
+    "M4 Max": (128, "MacBook Pro (Unified)", "apple", 0, 18.0),
     # Workstation GPUs
+    "RTX A6000": (48, "Workstation", "workstation", 0, 38.7),
+    "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud", 1.00, 91.6),
     # Cloud GPUs
+    "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud", 1.00, 31.2),
+    "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud", 0.70, 30.3),
+    "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud", 3.00, 77.9),
+    "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud", 5.00, 77.9),
+    "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud", 8.00, 267.6),
+    "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud", 12.00, 296.0),
     # AMD GPUs
+    "MI300X": (192, "AMD Cloud Instances", "amd", 6.00, 383.0),
 }
 # Bytes per element for different dtypes
         }
+def estimate_lora_memory(
+    param_count: int,
+    dtype_bytes: int,
+    lora_rank: int = 16,
+    lora_alpha: int = 32,
+    target_modules: int = 4,
+    use_qlora: bool = False
+) -> dict:
+    """
+    Approximate the memory footprint of a LoRA or QLoRA fine-tuning run.
+    The base model is counted as frozen weights; only the low-rank adapters carry
+    gradients and optimizer state. With use_qlora the frozen weights are counted
+    at 4 bits (0.5 bytes) per parameter instead of dtype_bytes.
+    lora_alpha is accepted but does not enter the estimate.
+    Returns a dict of byte counts per component, the adapter parameter count,
+    the summed total, and a fixed rough ratio versus full fine-tuning.
+    """
+    # Frozen base weights: 0.5 bytes/param when 4-bit quantized, else the model dtype
+    bytes_per_base_param = 0.5 if use_qlora else dtype_bytes
+    frozen_bytes = param_count * bytes_per_base_param
+    # Adapter size heuristic: scale the base parameter count by rank and module
+    # count instead of reading hidden sizes (typically ~0.1-1% of the base)
+    adapter_fraction = (lora_rank * 2 * target_modules) / 1000
+    adapter_params = int(param_count * adapter_fraction * 0.01)
+    adapter_bytes = adapter_params * dtype_bytes
+    # Only the adapters are trainable, so gradients mirror the adapter size
+    grad_bytes = adapter_params * dtype_bytes
+    # AdamW: two states per trainable parameter, 4 bytes each
+    adamw_bytes = adapter_params * 4 * 2
+    # Activations taken as half the base weight size (assumes gradient checkpointing)
+    act_bytes = frozen_bytes * 0.5
+    breakdown = {
+        "base_weights": frozen_bytes,
+        "lora_weights": adapter_bytes,
+        "lora_params": adapter_params,
+        "gradients": grad_bytes,
+        "optimizer": adamw_bytes,
+        "activations": act_bytes,
+    }
+    breakdown["total"] = frozen_bytes + adapter_bytes + grad_bytes + adamw_bytes + act_bytes
+    # Rough memory ratio relative to a full fine-tune
+    breakdown["vs_full_finetune_ratio"] = 0.3 if use_qlora else 0.5
+    return breakdown
+def estimate_throughput(
+    param_count: int,
+    gpu_tflops: float,
+    batch_size: int = 1,
+    context_length: int = 4096,
+    is_prefill: bool = False
+) -> dict:
+    """
+    Estimate aggregate tokens per second with a simple roofline model.
+    Throughput is the lower of a compute ceiling (gpu_tflops against 2 FLOPs per
+    parameter per token) and a memory ceiling (FP16 weights streamed at an assumed
+    1 TB/s). One weight read serves every sequence in the batch, so the memory
+    ceiling scales with batch_size; the compute ceiling does not, and the estimate
+    is never allowed to exceed it.
+    context_length is accepted but does not enter the estimate.
+    Returns a dict with peak_theoretical, memory_bound, estimated_tokens_per_sec,
+    batch_size and is_prefill. A non-positive param_count yields zero throughput
+    instead of a division error.
+    """
+    if param_count <= 0:
+        # Nothing to run: report zero rather than dividing by zero below
+        return {
+            "peak_theoretical": 0.0,
+            "memory_bound": 0.0,
+            "estimated_tokens_per_sec": 0.0,
+            "batch_size": batch_size,
+            "is_prefill": is_prefill,
+        }
+    # Rough estimate: 2 FLOPs per parameter per token (forward pass)
+    flops_per_token = 2 * param_count
+    # Compute ceiling: total tokens/s the GPU can process, whatever the batch size
+    peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token
+    # Memory ceiling for a single sequence: every FP16 weight is read once per token
+    memory_bandwidth_tbs = 1.0  # TB/s, rough average for modern GPUs
+    bytes_per_token = param_count * 2
+    memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token
+    # Prefill handles many tokens per weight read, modeled as a 10x higher memory ceiling
+    tokens_per_weight_read = 10 if is_prefill else 1
+    # Batching amortizes weight reads, so it lifts the memory ceiling only;
+    # the compute ceiling still caps the total
+    memory_ceiling = memory_bound_tokens * tokens_per_weight_read * batch_size
+    effective_tokens = min(peak_tokens_per_sec, memory_ceiling)
+    # Apply realistic efficiency factor (typically 30-60% of theoretical)
+    efficiency = 0.4
+    realistic_tokens = effective_tokens * efficiency
+    return {
+        "peak_theoretical": peak_tokens_per_sec,
+        "memory_bound": memory_bound_tokens,
+        "estimated_tokens_per_sec": realistic_tokens,
+        "batch_size": batch_size,
+        "is_prefill": is_prefill,
+    }
+def calculate_cost_estimate(
+    vram_required: float,
+    hours_per_day: float = 8,
+    days_per_month: float = 22
+) -> list:
+    """
+    Price out every GPU_SPECS entry that can hold the model and has an hourly rate.
+    Returns one dict per qualifying GPU (name, VRAM, hourly/daily/monthly cost and
+    instance label), ordered by ascending hourly cost.
+    """
+    priced = []
+    for name, spec in GPU_SPECS.items():
+        gpu_vram, instance_label, _category, rate, _tflops = spec
+        # Skip GPUs that are too small or that carry no hourly price
+        if not (gpu_vram >= vram_required and rate > 0):
+            continue
+        per_day = rate * hours_per_day
+        priced.append({
+            "gpu": name,
+            "vram": gpu_vram,
+            "hourly": rate,
+            "daily": per_day,
+            "monthly": per_day * days_per_month,
+            "instance": instance_label,
+        })
+    priced.sort(key=lambda entry: entry["hourly"])
+    return priced
+def search_models(query: str, limit: int = 10) -> list:
+    """
+    Look up text-generation models on the HuggingFace Hub matching query.
+    Returns up to limit model IDs, sorted by downloads (direction=-1). A query
+    shorter than two characters, or any failure during the lookup, gives [].
+    """
+    if query and len(query) >= 2:
+        try:
+            matches = list_models(
+                search=query,
+                sort="downloads",
+                direction=-1,
+                limit=limit,
+                filter="text-generation"
+            )
+            return [match.id for match in matches]
+        except Exception:
+            # Best-effort search: any lookup error is reported as "no results"
+            return []
+    return []
+def calculate_flash_attention_savings(
+    kv_cache_bytes: int,
+    context_length: int
+) -> dict:
+    """
+    Estimate the peak-memory saving of Flash Attention over standard attention.
+    Standard attention is modeled as one context_length x context_length FP16
+    score matrix. Flash Attention is modeled as a context_length x tile FP16
+    working buffer, where the tile is 128 tokens but never longer than the
+    sequence itself, so the reported saving cannot go negative for short contexts.
+    kv_cache_bytes is accepted but unused: the KV cache is O(n) either way, so
+    Flash Attention does not reduce it.
+    Returns a dict with standard_overhead_gb, flash_overhead_gb, savings_gb and
+    savings_percent.
+    """
+    seq_len = max(context_length, 0)
+    # A tile cannot cover more tokens than the sequence has
+    tile = min(seq_len, 128)
+    standard_attention_overhead = seq_len * seq_len * 2  # FP16
+    flash_attention_overhead = seq_len * tile * 2  # Block size overhead
+    savings_bytes = standard_attention_overhead - flash_attention_overhead
+    if standard_attention_overhead:
+        savings_ratio = 1 - (flash_attention_overhead / standard_attention_overhead)
+    else:
+        # Empty context: nothing is materialized, so nothing is saved
+        savings_ratio = 0.0
+    return {
+        "standard_overhead_gb": bytes_to_gb(standard_attention_overhead),
+        "flash_overhead_gb": bytes_to_gb(flash_attention_overhead),
+        "savings_gb": bytes_to_gb(savings_bytes),
+        "savings_percent": savings_ratio * 100,
+    }
 def calculate_vram(
     model_id: str,
     context_length: int = 4096,
     optimizer: str = "AdamW",
     serving_framework: str = "None (raw PyTorch)",
     num_gpus: int = 1,
+    parallelism: str = "Tensor Parallelism",
+    use_flash_attention: bool = True,
+    lora_rank: int = 16,
+    show_throughput: bool = True,
+    show_cost: bool = True
 ) -> tuple[str, dict | None]:
     """Main calculation function. Returns (markdown_results, chart_data)."""
         results.append("Could not find architecture details")
         kv_gb = 0
+    # Flash Attention savings
+    flash_savings = None
+    if use_flash_attention and kv_gb > 0:
+        kv_bytes = estimate_kv_cache_size(
+            num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
+        )
+        flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
     # Calculate total based on mode
+    if mode == "Training (Full)":
         training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
         base_gb = bytes_to_gb(training_mem["total_base"])
         # Activations estimation (rough: ~2x weights for typical batch)
         activation_gb = weights_gb * 2 * batch_size
+        if use_flash_attention and flash_savings:
+            activation_gb -= flash_savings["savings_gb"]
+            activation_gb = max(0.1, activation_gb)
         total_gb = base_gb + kv_gb + activation_gb
         results.append(f"\n### 🎓 Training Memory Breakdown")
             "KV Cache": kv_gb,
             "Activations": activation_gb,
         }
+    elif mode == "LoRA Fine-tuning":
+        lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
+        total_gb = bytes_to_gb(lora_mem["total"])
+        results.append(f"\n### 🔧 LoRA Fine-tuning (rank={lora_rank})")
+        results.append(f"- **Base weights (frozen):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
+        results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
+        results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
+        results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
+        results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
+        results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
+        chart_data = {
+            "Base Weights": bytes_to_gb(lora_mem['base_weights']),
+            "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
+            "Gradients": bytes_to_gb(lora_mem['gradients']),
+            "Optimizer": bytes_to_gb(lora_mem['optimizer']),
+            "Activations": bytes_to_gb(lora_mem['activations']),
+        }
+    elif mode == "QLoRA Fine-tuning":
+        lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
+        total_gb = bytes_to_gb(lora_mem["total"])
+        results.append(f"\n### 🔧 QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
+        results.append(f"- **Base weights (4-bit):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
+        results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
+        results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
+        results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
+        results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
+        results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
+        chart_data = {
+            "Base (4-bit)": bytes_to_gb(lora_mem['base_weights']),
+            "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
+            "Gradients": bytes_to_gb(lora_mem['gradients']),
+            "Optimizer": bytes_to_gb(lora_mem['optimizer']),
+            "Activations": bytes_to_gb(lora_mem['activations']),
+        }
     else:
         # Inference mode
         framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
         base_total = weights_gb + kv_gb
         overhead_gb = base_total * (framework_overhead - 1)
+        # Flash Attention reduces activation memory overhead during inference
+        if use_flash_attention and flash_savings:
+            overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
+            overhead_gb = max(0, overhead_gb)
         total_gb = base_total + overhead_gb
         results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
             "Overhead": overhead_gb,
         }
+    # Flash Attention info
+    if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
+        results.append(f"\n### ⚡ Flash Attention")
+        results.append(f"- **Enabled:** Yes")
+        results.append(f"- **Peak memory savings:** ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
     results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
     # Multi-GPU calculations
     # GPU Recommendations
     results.append(f"\n### 🎮 GPU Recommendations")
+    results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
+    results.append("|-----|------|-------|----------|------------|----------|")
+    for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
         fits = "✅" if vram >= effective_vram_needed else "❌"
         headroom = vram - effective_vram_needed
         headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
+        # Estimate throughput for this GPU
+        if show_throughput and vram >= effective_vram_needed:
+            throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
+            tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
+        else:
+            tok_str = "-"
+        results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
     # Quantization options (if model doesn't fit on consumer GPUs)
     if effective_vram_needed > 24:
         results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
+    # Cost estimates for cloud GPUs
+    if show_cost:
+        cost_estimates = calculate_cost_estimate(effective_vram_needed)
+        if cost_estimates:
+            results.append(f"\n### 💰 Cloud Cost Estimates")
+            results.append("*Based on 8 hrs/day, 22 days/month*\n")
+            results.append("| GPU | Hourly | Daily | Monthly |")
+            results.append("|-----|--------|-------|---------|")
+            for est in cost_estimates[:5]:  # Top 5 cheapest
+                results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
     return "\n".join(results), chart_data
     )
+def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
+    """
+    Compare inference, full-training and QLoRA memory needs for 2-5 models.
+    model_ids_text holds one HuggingFace model ID per line; blank lines are ignored.
+    Returns a markdown report: a comparison table, then (when at least two models
+    were analyzed successfully) the lowest-memory model per category. A model that
+    cannot be analyzed gets a row stating the reason instead of numbers. If fewer
+    than 2 or more than 5 IDs are given, a short instruction string is returned.
+    """
+    model_ids = [m.strip() for m in model_ids_text.split("\n") if m.strip()]
+    if len(model_ids) < 2:
+        return "Please enter at least 2 model IDs (one per line)"
+    if len(model_ids) > 5:
+        return "Maximum 5 models for comparison"
+    results = ["## Model Comparison\n"]
+    comparison_data = []
+    for model_id in model_ids:
+        try:
+            info = get_model_info(model_id)
+            config = get_config(model_id)
+            param_count, dominant_dtype = estimate_params_from_safetensors(info)
+            if param_count == 0:
+                comparison_data.append({
+                    "model": model_id,
+                    "params": "N/A",
+                    "error": "Could not determine parameters"
+                })
+                continue
+            dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
+            weights_gb = bytes_to_gb(param_count * dtype_bytes)
+            num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
+            num_kv_heads = config.get("num_key_value_heads",
+                                       config.get("num_attention_heads", 0))
+            head_dim = get_head_dim(config)
+            # KV cache is sized for a single sequence (batch size 1)
+            kv_bytes = estimate_kv_cache_size(
+                num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes
+            )
+            kv_gb = bytes_to_gb(kv_bytes)
+            total_inference = weights_gb + kv_gb
+            # Training estimate: base training memory plus ~2x weights for activations
+            training_mem = estimate_training_memory(param_count, dtype_bytes)
+            training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2
+            # QLoRA estimate at rank 16
+            qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
+            qlora_gb = bytes_to_gb(qlora_mem["total"])
+            comparison_data.append({
+                "model": model_id.split("/")[-1],
+                "full_id": model_id,
+                "params": f"{param_count/1e9:.1f}B",
+                "dtype": dominant_dtype,
+                "weights_gb": weights_gb,
+                "kv_gb": kv_gb,
+                "inference_gb": total_inference,
+                "training_gb": training_gb,
+                "qlora_gb": qlora_gb,
+            })
+        except Exception as e:
+            # Keep going: one bad model ID should not sink the whole comparison
+            comparison_data.append({
+                "model": model_id,
+                "error": str(e)
+            })
+    # Build comparison table
+    results.append(f"*Context length: {context_length:,}*\n")
+    results.append("| Model | Params | Inference | Training | QLoRA |")
+    results.append("|-------|--------|-----------|----------|-------|")
+    for data in comparison_data:
+        if "error" in data:
+            # Surface the captured reason; pipes and newlines would break the table row
+            reason = data["error"].replace("|", "\\|").replace("\n", " ") or "unknown error"
+            results.append(f"| {data['model']} | Error: {reason} | - | - | - |")
+        else:
+            results.append(
+                f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
+                f"{data['params']} | "
+                f"{data['inference_gb']:.1f} GB | "
+                f"{data['training_gb']:.1f} GB | "
+                f"{data['qlora_gb']:.1f} GB |"
+            )
+    # Find minimum for each category
+    valid_data = [d for d in comparison_data if "error" not in d]
+    if len(valid_data) >= 2:
+        results.append("\n### Recommendations")
+        min_inference = min(valid_data, key=lambda x: x["inference_gb"])
+        min_training = min(valid_data, key=lambda x: x["training_gb"])
+        min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])
+        results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
+        results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
+        results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")
+    return "\n".join(results)
+def export_results(result_text: str, format_type: str) -> str:
+    """
+    Render calculator output as JSON or plain text, stamped with the local time.
+    With format_type "JSON" the markdown is split into sections keyed by their
+    "### " headings (lines before the first heading go under "header"), and the
+    raw markdown is included as well. Lines under a repeated heading are appended
+    to the existing section, and non-ASCII characters such as the emoji in the
+    headings are written as-is rather than as \\uXXXX escapes.
+    Any other format_type produces plain text with the markdown bold and heading
+    markers replaced. Empty result_text returns a short notice instead.
+    """
+    if not result_text:
+        return "No results to export. Run a calculation first."
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    if format_type == "JSON":
+        sections: dict[str, list[str]] = {}
+        current_section = "header"
+        for line in result_text.split("\n"):
+            if line.startswith("### "):
+                current_section = line.replace("### ", "").strip()
+                # setdefault keeps lines already collected under a repeated heading
+                sections.setdefault(current_section, [])
+            elif line.strip():
+                sections.setdefault(current_section, []).append(line.strip())
+        data = {
+            "timestamp": timestamp,
+            "raw_markdown": result_text,
+            "sections": sections,
+        }
+        return json.dumps(data, indent=2, ensure_ascii=False)
+    # Plain text: strip bold markers and turn headings into "===" rules
+    plain = result_text.replace("**", "")
+    plain = plain.replace("###", "\n===")
+    plain = plain.replace("##", "\n===")
+    return f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
 # Build Gradio interface
 with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
     # Page header; it is created before the gr.Tabs() container below.
     gr.Markdown("""
+    # VRAM & Instance Type Calculator
+    Estimate GPU memory requirements for HuggingFace models. Supports inference, training, LoRA/QLoRA fine-tuning,
+    multi-GPU setups, model comparison, and detailed quantization recommendations.
     """)
+    with gr.Tabs():
+        # === CALCULATOR TAB ===
+        with gr.TabItem("Calculator"):
             # Top row: the Model ID box (scale=2) sits beside a narrower
             # search column (scale=1) holding the query box and Search button.
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_input = gr.Textbox(
+                        label="Model ID",
+                        placeholder="meta-llama/Llama-3.1-8B",
+                        info="Full HuggingFace model ID (org/model-name)"
+                    )
+                with gr.Column(scale=1):
+                    search_input = gr.Textbox(
+                        label="Search Models",
+                        placeholder="llama 8b",
+                        info="Search HuggingFace for models"
+                    )
+                    search_btn = gr.Button("Search", size="sm")
             # The results row starts hidden (visible=False). The Search button's
             # handler, do_search, returns a visibility update for this row as
             # its first output and the dropdown's new choices as its second.
+            with gr.Row(visible=False) as search_results_row:
+                search_results = gr.Dropdown(
+                    label="Search Results (click to select)",
+                    choices=[],
+                    interactive=True,
+                )
+            def do_search(query):
+                if not query:
+                    return gr.update(visible=False), gr.update(choices=[])
+                results = search_models(query, limit=10)
+                if results:
+                    return gr.update(visible=True), gr.update(choices=results, value=results[0])
+                return gr.update(visible=True), gr.update(choices=["No models found"], value=None)
+            def select_model(selected):
+                if selected and selected != "No models found":
+                    return selected
+                return ""
+            search_btn.click(
                 # do_search returns two updates, matched positionally to
                 # outputs: the row's visibility, then the dropdown's choices.
+                fn=do_search,
+                inputs=[search_input],
+                outputs=[search_results_row, search_results]
+            )
             # On a dropdown change, select_model's return value is written
             # into the Model ID textbox.
+            search_results.change(
+                fn=select_model,
+                inputs=[search_results],
+                outputs=[model_input]
+            )
             # Primary inputs: three equal-width columns (scale=1 each) for the
             # mode, the context length and the batch size.
+            with gr.Row():
+                with gr.Column(scale=1):
+                    mode_input = gr.Radio(
+                        choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
+                        value="Inference",
+                        label="Mode",
+                        info="LoRA/QLoRA use significantly less memory"
+                    )
+                with gr.Column(scale=1):
                     # 512..131072 in steps of 512, defaulting to 4096.
+                    context_input = gr.Slider(
+                        label="Context Length",
+                        minimum=512,
+                        maximum=131072,
+                        value=4096,
+                        step=512,
+                        info="Sequence length for KV cache"
+                    )
+                with gr.Column(scale=1):
                     # 1..64 in steps of 1, defaulting to 1.
+                    batch_input = gr.Slider(
+                        label="Batch Size",
+                        minimum=1,
+                        maximum=64,
+                        value=1,
+                        step=1,
+                        info="Concurrent sequences"
+                    )
+            with gr.Accordion("Advanced Options", open=False):
                 # Collapsed by default (open=False). Every widget in here still
                 # feeds the calculate handler through its default value.
+                with gr.Row():
+                    with gr.Column():
                         # Choices come from the SERVING_FRAMEWORKS mapping.
                         # NOTE(review): the default "None (raw PyTorch)" must be
                         # one of its keys (defined outside this view) - confirm.
+                        serving_input = gr.Dropdown(
+                            choices=list(SERVING_FRAMEWORKS.keys()),
+                            value="None (raw PyTorch)",
+                            label="Serving Framework",
+                            info="Different frameworks have different overhead"
+                        )
+                        optimizer_input = gr.Dropdown(
+                            choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
+                            value="AdamW",
+                            label="Optimizer (Training mode)",
+                            info="Optimizer state memory varies"
+                        )
                         # 4..128 in steps of 4, defaulting to 16.
+                        lora_rank_input = gr.Slider(
+                            label="LoRA Rank",
+                            minimum=4,
+                            maximum=128,
+                            value=16,
+                            step=4,
+                            info="Higher rank = more capacity but more memory"
+                        )
+                    with gr.Column():
                         # 1..8 GPUs, defaulting to a single GPU.
+                        num_gpus_input = gr.Slider(
+                            label="Number of GPUs",
+                            minimum=1,
+                            maximum=8,
+                            value=1,
+                            step=1,
+                            info="For multi-GPU setups"
+                        )
+                        parallelism_input = gr.Dropdown(
+                            choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
+                            value="Tensor Parallelism",
+                            label="Parallelism Strategy",
+                            info="How to distribute across GPUs"
+                        )
+                        flash_attention_input = gr.Checkbox(
+                            label="Use Flash Attention",
+                            value=True,
+                            info="Reduces peak memory usage"
+                        )
                 # Checkboxes for the throughput and cost sections; both default to on.
+                with gr.Row():
+                    show_throughput_input = gr.Checkbox(
+                        label="Show Throughput Estimates",
+                        value=True,
+                        info="Estimated tokens/sec per GPU"
+                    )
+                    show_cost_input = gr.Checkbox(
+                        label="Show Cost Estimates",
+                        value=True,
+                        info="Cloud GPU hourly/monthly costs"
+                    )
+            calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
             # Results area: the markdown report (scale=3) beside the bar chart (scale=1).
+            with gr.Row():
+                with gr.Column(scale=3):
+                    output = gr.Markdown(label="Results")
+                with gr.Column(scale=1):
                     # The x/y names must match the "Component" and "GB" columns
                     # of the DataFrame that run_calculation returns.
+                    chart_output = gr.BarPlot(
+                        x="Component",
+                        y="GB",
+                        title="Memory Breakdown",
+                        height=350,
+                    )
+            def run_calculation(
+                model_id, context_length, batch_size, mode, optimizer, serving,
+                num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
+            ):
+                result_text, chart_data = calculate_vram(
+                    model_id, context_length, batch_size, mode, optimizer, serving,
+                    num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
+                )
+                if chart_data:
+                    import pandas as pd
+                    df = pd.DataFrame({
+                        "Component": list(chart_data.keys()),
+                        "GB": list(chart_data.values())
+                    })
+                    return result_text, df
+                return result_text, None
+            calculate_btn.click(
+                fn=run_calculation,
                 # Inputs are passed positionally, so this order must match
                 # run_calculation's parameters: model_id, context_length,
                 # batch_size, mode, optimizer, serving, num_gpus, parallelism,
                 # flash_attention, lora_rank, show_throughput, show_cost.
+                inputs=[
+                    model_input, context_input, batch_input, mode_input,
+                    optimizer_input, serving_input, num_gpus_input, parallelism_input,
+                    flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
+                ],
                 # run_calculation returns (result text, DataFrame or None).
+                outputs=[output, chart_output]
             )
+            # Examples
             # Each row is [model ID, context length, batch size], matching the
             # three components listed in inputs below.
+            gr.Examples(
+                examples=[
+                    ["meta-llama/Llama-3.1-8B", 4096, 1],
+                    ["meta-llama/Llama-3.1-70B", 8192, 1],
+                    ["mistralai/Mistral-7B-v0.1", 8192, 1],
+                    ["Qwen/Qwen2.5-72B", 32768, 1],
+                    ["google/gemma-2-27b", 8192, 1],
+                    ["microsoft/phi-4", 16384, 1],
+                    ["deepseek-ai/DeepSeek-V3", 4096, 1],
+                    ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
+                ],
+                inputs=[model_input, context_input, batch_input],
+                label="Popular Models"
             )
+        # === COMPARE TAB ===
+        with gr.TabItem("Compare Models"):
+            gr.Markdown("""
+            Compare VRAM requirements across multiple models side-by-side.
+            Enter model IDs one per line (2-5 models).
+            """)
             # Multi-line textbox; its raw newline-separated text is passed to
             # compare_models as entered.
+            compare_models_input = gr.Textbox(
+                label="Model IDs (one per line)",
+                placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
+                lines=5,
+            )
             # Same range, step and default as the Calculator tab's context slider.
+            compare_context_input = gr.Slider(
                 label="Context Length",
                 minimum=512,
                 maximum=131072,
                 value=4096,
                 step=512,
             )
+            compare_btn = gr.Button("Compare Models", variant="primary")
+            compare_output = gr.Markdown(label="Comparison Results")
             # compare_models (defined outside this view) receives the model-ID
             # text and the context length; its result goes to the markdown
             # component above.
+            compare_btn.click(
+                fn=compare_models,
+                inputs=[compare_models_input, compare_context_input],
+                outputs=compare_output
+            )
             # Each example row is [newline-separated model IDs, context length].
+            gr.Examples(
+                examples=[
+                    ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
+                    ["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B\nmeta-llama/Llama-3.3-70B-Instruct", 8192],
+                ],
+                inputs=[compare_models_input, compare_context_input],
+                label="Example Comparisons"
             )
+        # === EXPORT TAB ===
+        with gr.TabItem("Export"):
+            gr.Markdown("""
+            Export your calculation results to JSON or plain text format.
+            First run a calculation in the Calculator tab, then copy the results here.
+            """)
             # Export is a manual step: this textbox is not wired to the
             # Calculator tab's output, so the user pastes the result text in.
+            export_input = gr.Textbox(
+                label="Paste Results Here",
+                placeholder="Paste the calculation results from the Calculator tab...",
+                lines=10,
+            )
             # The selected label is passed to export_results as its second argument.
+            export_format = gr.Radio(
+                choices=["JSON", "Plain Text"],
+                value="JSON",
+                label="Export Format"
+            )
+            export_btn = gr.Button("Export", variant="primary")
+            export_output = gr.Textbox(
+                label="Exported Data",
+                lines=15,
+                show_copy_button=True,
+            )
             # export_results receives (pasted text, format label) and returns
             # the string shown in export_output.
+            export_btn.click(
+                fn=export_results,
+                inputs=[export_input, export_format],
+                outputs=export_output
+            )
+    # Notes outside tabs
     # This footer sits at the same nesting level as the gr.Tabs() container,
     # not inside any TabItem, so it does not belong to a specific tab.
     gr.Markdown("""
     ---
+    ### Notes
     - **Inference mode:** Weights + KV cache + framework overhead
+    - **Training modes:** Full training, LoRA, and QLoRA with different memory profiles
     - **KV cache:** Scales linearly with context length and batch size
     - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
     - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
+    ### Disclaimers
     - Estimates are approximate; actual usage varies by implementation
+    - Flash Attention and other optimizations can reduce peak memory
+    - Throughput estimates assume ideal conditions
+    Built with Gradio & HuggingFace Hub API
     """)