Livengood Claude committed on
Commit 26bc78c · 1 Parent(s): 50bc6be

Add LoRA/QLoRA modes, model comparison, search, throughput, cost estimates, and export

New features:
- LoRA and QLoRA fine-tuning memory estimation modes
- Model comparison tab for side-by-side VRAM analysis
- Model search with HuggingFace API integration
- Throughput estimation (tokens/sec) per GPU
- Cloud cost estimates (hourly/daily/monthly)
- Flash Attention toggle with memory savings display
- Export results to JSON or plain text
- Tabbed interface (Calculator, Compare, Export)

Updated GPU specs with TFLOPs and hourly costs for cloud instances.
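As a rough sense-check of what those new fields drive, here is the arithmetic the new throughput and cost helpers apply, worked for one assumed example (an 8B-parameter model on the A100 80GB entry); it mirrors the formulas in `estimate_throughput()` and `calculate_cost_estimate()` in the diff below rather than adding anything new:

```python
# Worked example (illustration only) of the new throughput and cost estimates.
params = 8e9                              # assumed Llama-3.1-8B-class model
tflops_fp16, hourly_cost = 77.9, 5.00     # "A100 80GB" entry from the updated GPU_SPECS

compute_bound = tflops_fp16 * 1e12 / (2 * params)    # ~2 FLOPs per parameter per token
memory_bound = 1.0e12 / (params * 2)                 # ~1 TB/s bandwidth, FP16 weights read per token
tokens_per_sec = min(compute_bound, memory_bound) * 0.4   # ~40% realistic efficiency factor
monthly_cost = hourly_cost * 8 * 22                  # 8 hrs/day, 22 days/month

print(f"~{tokens_per_sec:.0f} tok/s, ~${monthly_cost:.0f}/month")  # about 25 tok/s and $880/month
```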

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2)
  1. .gitignore +2 -0
  2. app.py +674 -136
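Before the diffs, a minimal sketch of how the expanded entry point could be driven once this commit is applied. The keyword names and mode strings come from the new `calculate_vram` signature and UI choices shown in app.py below; the import path and the assumption that importing app.py does not launch the Gradio demo are mine, and the model ID is just one of the bundled examples.

```python
# Sketch only: exercising the expanded calculate_vram() added in this commit.
from app import calculate_vram  # assumes app.py is importable without launching the UI

markdown_report, chart_data = calculate_vram(
    model_id="meta-llama/Llama-3.1-8B",
    context_length=8192,
    batch_size=1,
    mode="QLoRA Fine-tuning",        # new modes: "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"
    optimizer="AdamW",
    serving_framework="None (raw PyTorch)",
    num_gpus=1,
    parallelism="Tensor Parallelism",
    use_flash_attention=True,        # new: subtracts estimated Flash Attention savings
    lora_rank=16,                    # new: LoRA/QLoRA adapter rank
    show_throughput=True,            # new: per-GPU tokens/sec column in the GPU table
    show_cost=True,                  # new: hourly/daily/monthly cloud cost table
)
print(markdown_report)
```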
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ __pycache__/*
2
+ .claude/*
app.py CHANGED
@@ -7,40 +7,45 @@ Fetches model metadata from HF Hub and calculates:
7
  - Recommended GPUs and cloud instances
8
  - Multi-GPU tensor parallelism estimates
9
  - Quantization options with detailed breakdown
10
  """
11
 
12
  import gradio as gr
13
- from huggingface_hub import HfApi, hf_hub_download
14
  import json
15
  from functools import lru_cache
 
16
 
17
  # Initialize HF API client
18
  api = HfApi()
19
 
20
- # GPU specs: name -> (VRAM in GB, typical cloud instance, category)
21
  GPU_SPECS = {
22
  # Consumer GPUs
23
- "RTX 3080": (10, "Consumer", "consumer"),
24
- "RTX 3090": (24, "Consumer", "consumer"),
25
- "RTX 4080": (16, "Consumer", "consumer"),
26
- "RTX 4090": (24, "Consumer", "consumer"),
27
- "RTX 5090": (32, "Consumer (est.)", "consumer"),
28
  # Apple Silicon
29
- "M2 Ultra": (192, "Mac Studio (Unified)", "apple"),
30
- "M3 Max": (128, "MacBook Pro (Unified)", "apple"),
31
- "M4 Max": (128, "MacBook Pro (Unified)", "apple"),
32
  # Workstation GPUs
33
- "RTX A6000": (48, "Workstation", "workstation"),
34
- "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud"),
35
  # Cloud GPUs
36
- "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud"),
37
- "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud"),
38
- "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud"),
39
- "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud"),
40
- "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud"),
41
- "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud"),
42
  # AMD GPUs
43
- "MI300X": (192, "AMD Cloud Instances", "amd"),
44
  }
45
 
46
  # Bytes per element for different dtypes
@@ -236,6 +241,175 @@ def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism:
236
  }
237
 
238
 
239
  def calculate_vram(
240
  model_id: str,
241
  context_length: int = 4096,
@@ -244,7 +418,11 @@ def calculate_vram(
244
  optimizer: str = "AdamW",
245
  serving_framework: str = "None (raw PyTorch)",
246
  num_gpus: int = 1,
247
- parallelism: str = "Tensor Parallelism"
248
  ) -> tuple[str, dict | None]:
249
  """Main calculation function. Returns (markdown_results, chart_data)."""
250
 
@@ -331,13 +509,25 @@ def calculate_vram(
331
  results.append("Could not find architecture details")
332
  kv_gb = 0
333
 
334
  # Calculate total based on mode
335
- if mode == "Training":
336
  training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
337
  base_gb = bytes_to_gb(training_mem["total_base"])
338
 
339
  # Activations estimation (rough: ~2x weights for typical batch)
340
  activation_gb = weights_gb * 2 * batch_size
341
  total_gb = base_gb + kv_gb + activation_gb
342
 
343
  results.append(f"\n### 🎓 Training Memory Breakdown")
@@ -354,11 +544,58 @@ def calculate_vram(
354
  "KV Cache": kv_gb,
355
  "Activations": activation_gb,
356
  }
357
  else:
358
  # Inference mode
359
  framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
360
  base_total = weights_gb + kv_gb
361
  overhead_gb = base_total * (framework_overhead - 1)
362
  total_gb = base_total + overhead_gb
363
 
364
  results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
@@ -372,6 +609,12 @@ def calculate_vram(
372
  "Overhead": overhead_gb,
373
  }
374
 
375
  results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
376
 
377
  # Multi-GPU calculations
@@ -389,14 +632,22 @@ def calculate_vram(
389
 
390
  # GPU Recommendations
391
  results.append(f"\n### 🎮 GPU Recommendations")
392
- results.append("| GPU | VRAM | Fits? | Headroom | Instance |")
393
- results.append("|-----|------|-------|----------|----------|")
394
 
395
- for gpu_name, (vram, instance, category) in GPU_SPECS.items():
396
  fits = "✅" if vram >= effective_vram_needed else "❌"
397
  headroom = vram - effective_vram_needed
398
  headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
399
- results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {instance} |")
400
 
401
  # Quantization options (if model doesn't fit on consumer GPUs)
402
  if effective_vram_needed > 24:
@@ -413,6 +664,17 @@ def calculate_vram(
413
 
414
  results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
415
 
416
  return "\n".join(results), chart_data
417
 
418
 
@@ -434,147 +696,423 @@ def create_memory_chart(chart_data: dict | None):
434
  )
435
 
436
 
437
  # Build Gradio interface
438
  with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
439
  gr.Markdown("""
440
- # 🧮 VRAM & Instance Type Calculator
441
 
442
- Estimate GPU memory requirements for HuggingFace models. Supports inference and training modes,
443
- multi-GPU setups, and provides detailed quantization recommendations.
444
  """)
445
 
446
- with gr.Row():
447
- with gr.Column(scale=2):
448
- model_input = gr.Textbox(
449
- label="Model ID",
450
- placeholder="meta-llama/Llama-3.1-8B",
451
- info="Full HuggingFace model ID (org/model-name)"
452
  )
453
 
454
- with gr.Row():
455
- with gr.Column(scale=1):
456
- mode_input = gr.Radio(
457
- choices=["Inference", "Training"],
458
- value="Inference",
459
- label="Mode",
460
- info="Training requires ~4x more memory"
461
  )
462
- with gr.Column(scale=1):
463
- context_input = gr.Slider(
464
  label="Context Length",
465
  minimum=512,
466
  maximum=131072,
467
  value=4096,
468
  step=512,
469
- info="Sequence length for KV cache"
470
- )
471
- with gr.Column(scale=1):
472
- batch_input = gr.Slider(
473
- label="Batch Size",
474
- minimum=1,
475
- maximum=64,
476
- value=1,
477
- step=1,
478
- info="Concurrent sequences"
479
  )
480
 
481
- with gr.Accordion("⚙️ Advanced Options", open=False):
482
- with gr.Row():
483
- with gr.Column():
484
- serving_input = gr.Dropdown(
485
- choices=list(SERVING_FRAMEWORKS.keys()),
486
- value="None (raw PyTorch)",
487
- label="Serving Framework",
488
- info="Different frameworks have different overhead"
489
- )
490
- optimizer_input = gr.Dropdown(
491
- choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
492
- value="AdamW",
493
- label="Optimizer (Training mode)",
494
- info="Optimizer state memory varies"
495
- )
496
- with gr.Column():
497
- num_gpus_input = gr.Slider(
498
- label="Number of GPUs",
499
- minimum=1,
500
- maximum=8,
501
- value=1,
502
- step=1,
503
- info="For multi-GPU setups"
504
- )
505
- parallelism_input = gr.Dropdown(
506
- choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
507
- value="Tensor Parallelism",
508
- label="Parallelism Strategy",
509
- info="How to distribute across GPUs"
510
- )
511
 
512
- calculate_btn = gr.Button("🚀 Calculate VRAM", variant="primary", size="lg")
513
-
514
- with gr.Row():
515
- with gr.Column(scale=3):
516
- output = gr.Markdown(label="Results")
517
- with gr.Column(scale=1):
518
- chart_output = gr.BarPlot(
519
- x="Component",
520
- y="GB",
521
- title="Memory Breakdown",
522
- height=350,
523
  )
524
 
525
- def run_calculation(model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism):
526
- result_text, chart_data = calculate_vram(
527
- model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism
528
- )
529
- if chart_data:
530
- import pandas as pd
531
- df = pd.DataFrame({
532
- "Component": list(chart_data.keys()),
533
- "GB": list(chart_data.values())
534
- })
535
- return result_text, df
536
- return result_text, None
537
-
538
- calculate_btn.click(
539
- fn=run_calculation,
540
- inputs=[
541
- model_input, context_input, batch_input, mode_input,
542
- optimizer_input, serving_input, num_gpus_input, parallelism_input
543
- ],
544
- outputs=[output, chart_output]
545
- )
546
 
547
- # Examples
548
- gr.Examples(
549
- examples=[
550
- ["meta-llama/Llama-3.1-8B", 4096, 1],
551
- ["meta-llama/Llama-3.1-70B", 8192, 1],
552
- ["mistralai/Mistral-7B-v0.1", 8192, 1],
553
- ["Qwen/Qwen2.5-72B", 32768, 1],
554
- ["google/gemma-2-27b", 8192, 1],
555
- ["microsoft/phi-4", 16384, 1],
556
- ["deepseek-ai/DeepSeek-V3", 4096, 1],
557
- ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
558
- ],
559
- inputs=[model_input, context_input, batch_input],
560
- label="🔥 Popular Models"
561
- )
562
563
  gr.Markdown("""
564
  ---
565
- ### 📝 Notes
566
  - **Inference mode:** Weights + KV cache + framework overhead
567
- - **Training mode:** Adds gradients, optimizer states, and activation memory
568
  - **KV cache:** Scales linearly with context length and batch size
569
  - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
570
  - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
571
 
572
- ### ⚠️ Disclaimers
573
  - Estimates are approximate; actual usage varies by implementation
574
- - Flash Attention and other optimizations can significantly reduce memory
575
- - GGUF models have different memory profiles than safetensors
576
 
577
- Built with 💜 using Gradio & HuggingFace Hub API
578
  """)
579
 
580
 
 
7
  - Recommended GPUs and cloud instances
8
  - Multi-GPU tensor parallelism estimates
9
  - Quantization options with detailed breakdown
10
+ - Model comparison across multiple models
11
+ - Throughput estimation
12
+ - Cloud cost analysis
13
+ - LoRA/QLoRA fine-tuning memory requirements
14
  """
15
 
16
  import gradio as gr
17
+ from huggingface_hub import HfApi, hf_hub_download, list_models
18
  import json
19
  from functools import lru_cache
20
+ from datetime import datetime
21
 
22
  # Initialize HF API client
23
  api = HfApi()
24
 
25
+ # GPU specs: name -> (VRAM in GB, typical cloud instance, category, hourly_cost, tflops_fp16)
26
  GPU_SPECS = {
27
  # Consumer GPUs
28
+ "RTX 3080": (10, "Consumer", "consumer", 0, 29.8),
29
+ "RTX 3090": (24, "Consumer", "consumer", 0, 35.6),
30
+ "RTX 4080": (16, "Consumer", "consumer", 0, 48.7),
31
+ "RTX 4090": (24, "Consumer", "consumer", 0, 82.6),
32
+ "RTX 5090": (32, "Consumer (est.)", "consumer", 0, 105.0),
33
  # Apple Silicon
34
+ "M2 Ultra": (192, "Mac Studio (Unified)", "apple", 0, 27.2),
35
+ "M3 Max": (128, "MacBook Pro (Unified)", "apple", 0, 14.2),
36
+ "M4 Max": (128, "MacBook Pro (Unified)", "apple", 0, 18.0),
37
  # Workstation GPUs
38
+ "RTX A6000": (48, "Workstation", "workstation", 0, 38.7),
39
+ "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud", 1.00, 91.6),
40
  # Cloud GPUs
41
+ "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud", 1.00, 31.2),
42
+ "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud", 0.70, 30.3),
43
+ "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud", 3.00, 77.9),
44
+ "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud", 5.00, 77.9),
45
+ "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud", 8.00, 267.6),
46
+ "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud", 12.00, 296.0),
47
  # AMD GPUs
48
+ "MI300X": (192, "AMD Cloud Instances", "amd", 6.00, 383.0),
49
  }
50
 
51
  # Bytes per element for different dtypes
 
241
  }
242
 
243
 
244
+ def estimate_lora_memory(
245
+ param_count: int,
246
+ dtype_bytes: int,
247
+ lora_rank: int = 16,
248
+ lora_alpha: int = 32,
249
+ target_modules: int = 4,
250
+ use_qlora: bool = False
251
+ ) -> dict:
252
+ """
253
+ Estimate LoRA/QLoRA fine-tuning memory requirements.
254
+
255
+ LoRA adds low-rank adaptation matrices to specific layers.
256
+ QLoRA additionally quantizes the base model to 4-bit.
257
+ """
258
+ # Base model weights
259
+ if use_qlora:
260
+ # QLoRA: 4-bit quantized weights
261
+ base_weights_bytes = param_count * 0.5 # 4-bit = 0.5 bytes/param
262
+ else:
263
+ base_weights_bytes = param_count * dtype_bytes
264
+
265
+ # LoRA adapter parameters (A and B matrices for each target module)
266
+ # Typical target modules: q_proj, k_proj, v_proj, o_proj (4 modules)
267
+ # Each LoRA layer: hidden_size * rank (A) + rank * hidden_size (B)
268
+ # Approximate as 2 * hidden_size * rank per module
269
+ # For simplicity, estimate based on total params
270
+ lora_params_ratio = (lora_rank * 2 * target_modules) / 1000 # Rough estimate
271
+ lora_params = int(param_count * lora_params_ratio * 0.01) # Usually ~0.1-1% of base
272
+ lora_weights_bytes = lora_params * dtype_bytes
273
+
274
+ # Gradients only for LoRA params (not frozen base)
275
+ gradients_bytes = lora_params * dtype_bytes
276
+
277
+ # Optimizer states for LoRA params only
278
+ optimizer_bytes = lora_params * 4 * 2 # AdamW: 2 states, 4 bytes each
279
+
280
+ # Activations (still needed, but can use gradient checkpointing)
281
+ activation_bytes = base_weights_bytes * 0.5 # Reduced with checkpointing
282
+
283
+ return {
284
+ "base_weights": base_weights_bytes,
285
+ "lora_weights": lora_weights_bytes,
286
+ "lora_params": lora_params,
287
+ "gradients": gradients_bytes,
288
+ "optimizer": optimizer_bytes,
289
+ "activations": activation_bytes,
290
+ "total": base_weights_bytes + lora_weights_bytes + gradients_bytes + optimizer_bytes + activation_bytes,
291
+ "vs_full_finetune_ratio": 0.3 if use_qlora else 0.5, # Rough memory savings
292
+ }
293
+
294
+
295
+ def estimate_throughput(
296
+ param_count: int,
297
+ gpu_tflops: float,
298
+ batch_size: int = 1,
299
+ context_length: int = 4096,
300
+ is_prefill: bool = False
301
+ ) -> dict:
302
+ """
303
+ Estimate tokens per second throughput.
304
+
305
+ Based on roofline model: throughput limited by compute or memory bandwidth.
306
+ Most LLM inference is memory-bound for single-batch decode.
307
+ """
308
+ # Rough estimate: 2 FLOPs per parameter per token (forward pass)
309
+ flops_per_token = 2 * param_count
310
+
311
+ # Peak theoretical throughput (compute-bound)
312
+ peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token
313
+
314
+ # Memory-bound estimate (more realistic for decode)
315
+ # Assume ~1TB/s memory bandwidth for modern GPUs
316
+ memory_bandwidth_tbs = 1.0 # TB/s, rough average
317
+ bytes_per_token = param_count * 2 # FP16 weights need to be read
318
+ memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token
319
+
320
+ # Prefill is more compute-bound, decode is memory-bound
321
+ if is_prefill:
322
+ effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens * 10) * batch_size
323
+ else:
324
+ effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens) * batch_size
325
+
326
+ # Apply realistic efficiency factor (typically 30-60% of theoretical)
327
+ efficiency = 0.4
328
+ realistic_tokens = effective_tokens * efficiency
329
+
330
+ return {
331
+ "peak_theoretical": peak_tokens_per_sec,
332
+ "memory_bound": memory_bound_tokens,
333
+ "estimated_tokens_per_sec": realistic_tokens,
334
+ "batch_size": batch_size,
335
+ "is_prefill": is_prefill,
336
+ }
337
+
338
+
339
+ def calculate_cost_estimate(
340
+ vram_required: float,
341
+ hours_per_day: float = 8,
342
+ days_per_month: float = 22
343
+ ) -> list:
344
+ """Calculate cost estimates for cloud GPUs that fit the model."""
345
+ estimates = []
346
+
347
+ for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
348
+ if vram >= vram_required and hourly_cost > 0:
349
+ daily_cost = hourly_cost * hours_per_day
350
+ monthly_cost = daily_cost * days_per_month
351
+ estimates.append({
352
+ "gpu": gpu_name,
353
+ "vram": vram,
354
+ "hourly": hourly_cost,
355
+ "daily": daily_cost,
356
+ "monthly": monthly_cost,
357
+ "instance": instance,
358
+ })
359
+
360
+ return sorted(estimates, key=lambda x: x["hourly"])
361
+
362
+
363
+ def search_models(query: str, limit: int = 10) -> list:
364
+ """Search HuggingFace models by name."""
365
+ if not query or len(query) < 2:
366
+ return []
367
+
368
+ try:
369
+ models = list(list_models(
370
+ search=query,
371
+ sort="downloads",
372
+ direction=-1,
373
+ limit=limit,
374
+ filter="text-generation"
375
+ ))
376
+ return [m.id for m in models]
377
+ except Exception:
378
+ return []
379
+
380
+
381
+ def calculate_flash_attention_savings(
382
+ kv_cache_bytes: int,
383
+ context_length: int
384
+ ) -> dict:
385
+ """
386
+ Estimate memory savings from Flash Attention.
387
+
388
+ Flash Attention uses tiling to reduce memory from O(n^2) to O(n).
389
+ """
390
+ # Standard attention materializes full attention matrix
391
+ # Flash Attention streams through, never materializing full matrix
392
+ # Savings primarily in activation memory, not KV cache
393
+
394
+ # KV cache itself is O(n), so Flash Attention doesn't reduce it
395
+ # But it dramatically reduces peak memory during computation
396
+
397
+ # Estimate: Flash Attention reduces peak memory by avoiding
398
+ # the O(n^2) attention matrix materialization
399
+ standard_attention_overhead = context_length * context_length * 2 # FP16
400
+ flash_attention_overhead = context_length * 128 * 2 # Block size overhead
401
+
402
+ savings_bytes = standard_attention_overhead - flash_attention_overhead
403
+ savings_ratio = 1 - (flash_attention_overhead / max(standard_attention_overhead, 1))
404
+
405
+ return {
406
+ "standard_overhead_gb": bytes_to_gb(standard_attention_overhead),
407
+ "flash_overhead_gb": bytes_to_gb(flash_attention_overhead),
408
+ "savings_gb": bytes_to_gb(savings_bytes),
409
+ "savings_percent": savings_ratio * 100,
410
+ }
411
+
412
+
413
  def calculate_vram(
414
  model_id: str,
415
  context_length: int = 4096,
 
418
  optimizer: str = "AdamW",
419
  serving_framework: str = "None (raw PyTorch)",
420
  num_gpus: int = 1,
421
+ parallelism: str = "Tensor Parallelism",
422
+ use_flash_attention: bool = True,
423
+ lora_rank: int = 16,
424
+ show_throughput: bool = True,
425
+ show_cost: bool = True
426
  ) -> tuple[str, dict | None]:
427
  """Main calculation function. Returns (markdown_results, chart_data)."""
428
 
 
509
  results.append("Could not find architecture details")
510
  kv_gb = 0
511
 
512
+ # Flash Attention savings
513
+ flash_savings = None
514
+ if use_flash_attention and kv_gb > 0:
515
+ kv_bytes = estimate_kv_cache_size(
516
+ num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
517
+ )
518
+ flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
519
+
520
  # Calculate total based on mode
521
+ if mode == "Training (Full)":
522
  training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
523
  base_gb = bytes_to_gb(training_mem["total_base"])
524
 
525
  # Activations estimation (rough: ~2x weights for typical batch)
526
  activation_gb = weights_gb * 2 * batch_size
527
+ if use_flash_attention and flash_savings:
528
+ activation_gb -= flash_savings["savings_gb"]
529
+ activation_gb = max(0.1, activation_gb)
530
+
531
  total_gb = base_gb + kv_gb + activation_gb
532
 
533
  results.append(f"\n### 🎓 Training Memory Breakdown")
 
544
  "KV Cache": kv_gb,
545
  "Activations": activation_gb,
546
  }
547
+
548
+ elif mode == "LoRA Fine-tuning":
549
+ lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
550
+ total_gb = bytes_to_gb(lora_mem["total"])
551
+
552
+ results.append(f"\n### 🔧 LoRA Fine-tuning (rank={lora_rank})")
553
+ results.append(f"- **Base weights (frozen):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
554
+ results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
555
+ results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
556
+ results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
557
+ results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
558
+ results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
559
+
560
+ chart_data = {
561
+ "Base Weights": bytes_to_gb(lora_mem['base_weights']),
562
+ "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
563
+ "Gradients": bytes_to_gb(lora_mem['gradients']),
564
+ "Optimizer": bytes_to_gb(lora_mem['optimizer']),
565
+ "Activations": bytes_to_gb(lora_mem['activations']),
566
+ }
567
+
568
+ elif mode == "QLoRA Fine-tuning":
569
+ lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
570
+ total_gb = bytes_to_gb(lora_mem["total"])
571
+
572
+ results.append(f"\n### 🔧 QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
573
+ results.append(f"- **Base weights (4-bit):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
574
+ results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
575
+ results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
576
+ results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
577
+ results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
578
+ results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
579
+
580
+ chart_data = {
581
+ "Base (4-bit)": bytes_to_gb(lora_mem['base_weights']),
582
+ "LoRA Adapters": bytes_to_gb(lora_mem['lora_weights']),
583
+ "Gradients": bytes_to_gb(lora_mem['gradients']),
584
+ "Optimizer": bytes_to_gb(lora_mem['optimizer']),
585
+ "Activations": bytes_to_gb(lora_mem['activations']),
586
+ }
587
+
588
  else:
589
  # Inference mode
590
  framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
591
  base_total = weights_gb + kv_gb
592
  overhead_gb = base_total * (framework_overhead - 1)
593
+
594
+ # Flash Attention reduces activation memory overhead during inference
595
+ if use_flash_attention and flash_savings:
596
+ overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
597
+ overhead_gb = max(0, overhead_gb)
598
+
599
  total_gb = base_total + overhead_gb
600
 
601
  results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
 
609
  "Overhead": overhead_gb,
610
  }
611
 
612
+ # Flash Attention info
613
+ if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
614
+ results.append(f"\n### ⚡ Flash Attention")
615
+ results.append(f"- **Enabled:** Yes")
616
+ results.append(f"- **Peak memory savings:** ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
617
+
618
  results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
619
 
620
  # Multi-GPU calculations
 
632
 
633
  # GPU Recommendations
634
  results.append(f"\n### 🎮 GPU Recommendations")
635
+ results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
636
+ results.append("|-----|------|-------|----------|------------|----------|")
637
 
638
+ for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
639
  fits = "✅" if vram >= effective_vram_needed else "❌"
640
  headroom = vram - effective_vram_needed
641
  headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
642
+
643
+ # Estimate throughput for this GPU
644
+ if show_throughput and vram >= effective_vram_needed:
645
+ throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
646
+ tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
647
+ else:
648
+ tok_str = "-"
649
+
650
+ results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
651
 
652
  # Quantization options (if model doesn't fit on consumer GPUs)
653
  if effective_vram_needed > 24:
 
664
 
665
  results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
666
 
667
+ # Cost estimates for cloud GPUs
668
+ if show_cost:
669
+ cost_estimates = calculate_cost_estimate(effective_vram_needed)
670
+ if cost_estimates:
671
+ results.append(f"\n### 💰 Cloud Cost Estimates")
672
+ results.append("*Based on 8 hrs/day, 22 days/month*\n")
673
+ results.append("| GPU | Hourly | Daily | Monthly |")
674
+ results.append("|-----|--------|-------|---------|")
675
+ for est in cost_estimates[:5]: # Top 5 cheapest
676
+ results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
677
+
678
  return "\n".join(results), chart_data
679
 
680
 
 
696
  )
697
 
698
 
699
+ def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
700
+ """Compare multiple models side by side."""
701
+ model_ids = [m.strip() for m in model_ids_text.split("\n") if m.strip()]
702
+
703
+ if len(model_ids) < 2:
704
+ return "Please enter at least 2 model IDs (one per line)"
705
+
706
+ if len(model_ids) > 5:
707
+ return "Maximum 5 models for comparison"
708
+
709
+ results = ["## Model Comparison\n"]
710
+ comparison_data = []
711
+
712
+ for model_id in model_ids:
713
+ try:
714
+ info = get_model_info(model_id)
715
+ config = get_config(model_id)
716
+ param_count, dominant_dtype = estimate_params_from_safetensors(info)
717
+
718
+ if param_count == 0:
719
+ comparison_data.append({
720
+ "model": model_id,
721
+ "params": "N/A",
722
+ "error": "Could not determine parameters"
723
+ })
724
+ continue
725
+
726
+ dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
727
+ weights_gb = bytes_to_gb(param_count * dtype_bytes)
728
+
729
+ num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
730
+ num_kv_heads = config.get("num_key_value_heads",
731
+ config.get("num_attention_heads", 0))
732
+ head_dim = get_head_dim(config)
733
+
734
+ kv_bytes = estimate_kv_cache_size(
735
+ num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes
736
+ )
737
+ kv_gb = bytes_to_gb(kv_bytes)
738
+ total_inference = weights_gb + kv_gb
739
+
740
+ # Training estimate
741
+ training_mem = estimate_training_memory(param_count, dtype_bytes)
742
+ training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2
743
+
744
+ # QLoRA estimate
745
+ qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
746
+ qlora_gb = bytes_to_gb(qlora_mem["total"])
747
+
748
+ comparison_data.append({
749
+ "model": model_id.split("/")[-1],
750
+ "full_id": model_id,
751
+ "params": f"{param_count/1e9:.1f}B",
752
+ "dtype": dominant_dtype,
753
+ "weights_gb": weights_gb,
754
+ "kv_gb": kv_gb,
755
+ "inference_gb": total_inference,
756
+ "training_gb": training_gb,
757
+ "qlora_gb": qlora_gb,
758
+ })
759
+ except Exception as e:
760
+ comparison_data.append({
761
+ "model": model_id,
762
+ "error": str(e)
763
+ })
764
+
765
+ # Build comparison table
766
+ results.append(f"*Context length: {context_length:,}*\n")
767
+ results.append("| Model | Params | Inference | Training | QLoRA |")
768
+ results.append("|-------|--------|-----------|----------|-------|")
769
+
770
+ for data in comparison_data:
771
+ if "error" in data:
772
+ results.append(f"| {data['model']} | Error | - | - | - |")
773
+ else:
774
+ results.append(
775
+ f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
776
+ f"{data['params']} | "
777
+ f"{data['inference_gb']:.1f} GB | "
778
+ f"{data['training_gb']:.1f} GB | "
779
+ f"{data['qlora_gb']:.1f} GB |"
780
+ )
781
+
782
+ # Find minimum for each category
783
+ valid_data = [d for d in comparison_data if "error" not in d]
784
+ if len(valid_data) >= 2:
785
+ results.append("\n### Recommendations")
786
+
787
+ min_inference = min(valid_data, key=lambda x: x["inference_gb"])
788
+ min_training = min(valid_data, key=lambda x: x["training_gb"])
789
+ min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])
790
+
791
+ results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
792
+ results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
793
+ results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")
794
+
795
+ return "\n".join(results)
796
+
797
+
798
+ def export_results(result_text: str, format_type: str) -> str:
799
+ """Export results to different formats."""
800
+ if not result_text:
801
+ return "No results to export. Run a calculation first."
802
+
803
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
804
+
805
+ if format_type == "JSON":
806
+ # Parse markdown to create structured JSON
807
+ import re
808
+ lines = result_text.split("\n")
809
+ data = {
810
+ "timestamp": timestamp,
811
+ "raw_markdown": result_text,
812
+ "sections": {}
813
+ }
814
+
815
+ current_section = "header"
816
+ for line in lines:
817
+ if line.startswith("### "):
818
+ current_section = line.replace("### ", "").strip()
819
+ data["sections"][current_section] = []
820
+ elif line.strip():
821
+ if current_section not in data["sections"]:
822
+ data["sections"][current_section] = []
823
+ data["sections"][current_section].append(line.strip())
824
+
825
+ return json.dumps(data, indent=2)
826
+
827
+ else: # Plain text
828
+ # Convert markdown to plain text
829
+ plain = result_text
830
+ plain = plain.replace("**", "")
831
+ plain = plain.replace("###", "\n===")
832
+ plain = plain.replace("##", "\n===")
833
+ plain = f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
834
+ return plain
835
+
836
+
837
  # Build Gradio interface
838
  with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
839
  gr.Markdown("""
840
+ # VRAM & Instance Type Calculator
841
 
842
+ Estimate GPU memory requirements for HuggingFace models. Supports inference, training, LoRA/QLoRA fine-tuning,
843
+ multi-GPU setups, model comparison, and detailed quantization recommendations.
844
  """)
845
 
846
+ with gr.Tabs():
847
+ # === CALCULATOR TAB ===
848
+ with gr.TabItem("Calculator"):
849
+ with gr.Row():
850
+ with gr.Column(scale=2):
851
+ model_input = gr.Textbox(
852
+ label="Model ID",
853
+ placeholder="meta-llama/Llama-3.1-8B",
854
+ info="Full HuggingFace model ID (org/model-name)"
855
+ )
856
+ with gr.Column(scale=1):
857
+ search_input = gr.Textbox(
858
+ label="Search Models",
859
+ placeholder="llama 8b",
860
+ info="Search HuggingFace for models"
861
+ )
862
+ search_btn = gr.Button("Search", size="sm")
863
+
864
+ with gr.Row(visible=False) as search_results_row:
865
+ search_results = gr.Dropdown(
866
+ label="Search Results (click to select)",
867
+ choices=[],
868
+ interactive=True,
869
+ )
870
+
871
+ def do_search(query):
872
+ if not query:
873
+ return gr.update(visible=False), gr.update(choices=[])
874
+ results = search_models(query, limit=10)
875
+ if results:
876
+ return gr.update(visible=True), gr.update(choices=results, value=results[0])
877
+ return gr.update(visible=True), gr.update(choices=["No models found"], value=None)
878
+
879
+ def select_model(selected):
880
+ if selected and selected != "No models found":
881
+ return selected
882
+ return ""
883
+
884
+ search_btn.click(
885
+ fn=do_search,
886
+ inputs=[search_input],
887
+ outputs=[search_results_row, search_results]
888
+ )
889
+ search_results.change(
890
+ fn=select_model,
891
+ inputs=[search_results],
892
+ outputs=[model_input]
893
+ )
894
+
895
+ with gr.Row():
896
+ with gr.Column(scale=1):
897
+ mode_input = gr.Radio(
898
+ choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
899
+ value="Inference",
900
+ label="Mode",
901
+ info="LoRA/QLoRA use significantly less memory"
902
+ )
903
+ with gr.Column(scale=1):
904
+ context_input = gr.Slider(
905
+ label="Context Length",
906
+ minimum=512,
907
+ maximum=131072,
908
+ value=4096,
909
+ step=512,
910
+ info="Sequence length for KV cache"
911
+ )
912
+ with gr.Column(scale=1):
913
+ batch_input = gr.Slider(
914
+ label="Batch Size",
915
+ minimum=1,
916
+ maximum=64,
917
+ value=1,
918
+ step=1,
919
+ info="Concurrent sequences"
920
+ )
921
+
922
+ with gr.Accordion("Advanced Options", open=False):
923
+ with gr.Row():
924
+ with gr.Column():
925
+ serving_input = gr.Dropdown(
926
+ choices=list(SERVING_FRAMEWORKS.keys()),
927
+ value="None (raw PyTorch)",
928
+ label="Serving Framework",
929
+ info="Different frameworks have different overhead"
930
+ )
931
+ optimizer_input = gr.Dropdown(
932
+ choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
933
+ value="AdamW",
934
+ label="Optimizer (Training mode)",
935
+ info="Optimizer state memory varies"
936
+ )
937
+ lora_rank_input = gr.Slider(
938
+ label="LoRA Rank",
939
+ minimum=4,
940
+ maximum=128,
941
+ value=16,
942
+ step=4,
943
+ info="Higher rank = more capacity but more memory"
944
+ )
945
+ with gr.Column():
946
+ num_gpus_input = gr.Slider(
947
+ label="Number of GPUs",
948
+ minimum=1,
949
+ maximum=8,
950
+ value=1,
951
+ step=1,
952
+ info="For multi-GPU setups"
953
+ )
954
+ parallelism_input = gr.Dropdown(
955
+ choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
956
+ value="Tensor Parallelism",
957
+ label="Parallelism Strategy",
958
+ info="How to distribute across GPUs"
959
+ )
960
+ flash_attention_input = gr.Checkbox(
961
+ label="Use Flash Attention",
962
+ value=True,
963
+ info="Reduces peak memory usage"
964
+ )
965
+ with gr.Row():
966
+ show_throughput_input = gr.Checkbox(
967
+ label="Show Throughput Estimates",
968
+ value=True,
969
+ info="Estimated tokens/sec per GPU"
970
+ )
971
+ show_cost_input = gr.Checkbox(
972
+ label="Show Cost Estimates",
973
+ value=True,
974
+ info="Cloud GPU hourly/monthly costs"
975
+ )
976
+
977
+ calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
978
+
979
+ with gr.Row():
980
+ with gr.Column(scale=3):
981
+ output = gr.Markdown(label="Results")
982
+ with gr.Column(scale=1):
983
+ chart_output = gr.BarPlot(
984
+ x="Component",
985
+ y="GB",
986
+ title="Memory Breakdown",
987
+ height=350,
988
+ )
989
+
990
+ def run_calculation(
991
+ model_id, context_length, batch_size, mode, optimizer, serving,
992
+ num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
993
+ ):
994
+ result_text, chart_data = calculate_vram(
995
+ model_id, context_length, batch_size, mode, optimizer, serving,
996
+ num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
997
+ )
998
+ if chart_data:
999
+ import pandas as pd
1000
+ df = pd.DataFrame({
1001
+ "Component": list(chart_data.keys()),
1002
+ "GB": list(chart_data.values())
1003
+ })
1004
+ return result_text, df
1005
+ return result_text, None
1006
+
1007
+ calculate_btn.click(
1008
+ fn=run_calculation,
1009
+ inputs=[
1010
+ model_input, context_input, batch_input, mode_input,
1011
+ optimizer_input, serving_input, num_gpus_input, parallelism_input,
1012
+ flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
1013
+ ],
1014
+ outputs=[output, chart_output]
1015
  )
1016
 
1017
+ # Examples
1018
+ gr.Examples(
1019
+ examples=[
1020
+ ["meta-llama/Llama-3.1-8B", 4096, 1],
1021
+ ["meta-llama/Llama-3.1-70B", 8192, 1],
1022
+ ["mistralai/Mistral-7B-v0.1", 8192, 1],
1023
+ ["Qwen/Qwen2.5-72B", 32768, 1],
1024
+ ["google/gemma-2-27b", 8192, 1],
1025
+ ["microsoft/phi-4", 16384, 1],
1026
+ ["deepseek-ai/DeepSeek-V3", 4096, 1],
1027
+ ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
1028
+ ],
1029
+ inputs=[model_input, context_input, batch_input],
1030
+ label="Popular Models"
1031
  )
1032
+
1033
+ # === COMPARE TAB ===
1034
+ with gr.TabItem("Compare Models"):
1035
+ gr.Markdown("""
1036
+ Compare VRAM requirements across multiple models side-by-side.
1037
+ Enter model IDs one per line (2-5 models).
1038
+ """)
1039
+
1040
+ compare_models_input = gr.Textbox(
1041
+ label="Model IDs (one per line)",
1042
+ placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
1043
+ lines=5,
1044
+ )
1045
+ compare_context_input = gr.Slider(
1046
  label="Context Length",
1047
  minimum=512,
1048
  maximum=131072,
1049
  value=4096,
1050
  step=512,
1051
  )
1052
+ compare_btn = gr.Button("Compare Models", variant="primary")
1053
+ compare_output = gr.Markdown(label="Comparison Results")
1054
 
1055
+ compare_btn.click(
1056
+ fn=compare_models,
1057
+ inputs=[compare_models_input, compare_context_input],
1058
+ outputs=compare_output
1059
+ )
1060
 
1061
+ gr.Examples(
1062
+ examples=[
1063
+ ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
1064
+ ["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B\nmeta-llama/Llama-3.3-70B-Instruct", 8192],
1065
+ ],
1066
+ inputs=[compare_models_input, compare_context_input],
1067
+ label="Example Comparisons"
1068
  )
1069
 
1070
+ # === EXPORT TAB ===
1071
+ with gr.TabItem("Export"):
1072
+ gr.Markdown("""
1073
+ Export your calculation results to JSON or plain text format.
1074
+ First run a calculation in the Calculator tab, then copy the results here.
1075
+ """)
1076
+
1077
+ export_input = gr.Textbox(
1078
+ label="Paste Results Here",
1079
+ placeholder="Paste the calculation results from the Calculator tab...",
1080
+ lines=10,
1081
+ )
1082
+ export_format = gr.Radio(
1083
+ choices=["JSON", "Plain Text"],
1084
+ value="JSON",
1085
+ label="Export Format"
1086
+ )
1087
+ export_btn = gr.Button("Export", variant="primary")
1088
+ export_output = gr.Textbox(
1089
+ label="Exported Data",
1090
+ lines=15,
1091
+ show_copy_button=True,
1092
+ )
1093
 
1094
+ export_btn.click(
1095
+ fn=export_results,
1096
+ inputs=[export_input, export_format],
1097
+ outputs=export_output
1098
+ )
1099
 
1100
+ # Notes outside tabs
1101
  gr.Markdown("""
1102
  ---
1103
+ ### Notes
1104
  - **Inference mode:** Weights + KV cache + framework overhead
1105
+ - **Training modes:** Full training, LoRA, and QLoRA with different memory profiles
1106
  - **KV cache:** Scales linearly with context length and batch size
1107
  - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
1108
  - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
1109
 
1110
+ ### Disclaimers
1111
  - Estimates are approximate; actual usage varies by implementation
1112
+ - Flash Attention and other optimizations can reduce peak memory
1113
+ - Throughput estimates assume ideal conditions
1114
 
1115
+ Built with Gradio & HuggingFace Hub API
1116
  """)
1117
 
1118
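The Notes block near the end of the new UI states that the KV cache scales linearly with context length and batch size. A small sketch of that arithmetic, using the standard 2 * layers * kv_heads * head_dim * context * batch * bytes formula with an assumed Llama-3.1-8B-like configuration (32 layers, 8 KV heads, head dim 128, FP16); `estimate_kv_cache_size()` in app.py remains the app's own implementation:

```python
# Sketch of the linear KV-cache scaling mentioned in the Notes (assumed example config).
def kv_cache_bytes(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes=2):
    # Two tensors (K and V) per layer, each kv_heads * head_dim * context * batch elements.
    return 2 * num_layers * num_kv_heads * head_dim * context_length * batch_size * dtype_bytes

for ctx in (4096, 8192, 131072):
    gb = kv_cache_bytes(32, 8, 128, ctx, batch_size=1) / 1024**3
    print(f"context={ctx:>6}: ~{gb:.2f} GB")  # roughly 0.5 GB, 1 GB, 16 GB: doubles as context doubles
```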