harshithsaiv committed on
Commit
0774ec2
·
1 Parent(s): 5e16ca3

feat: complete 4-method benchmark with honest memory reporting


Key finding: Naive uint8 storage = same as uniform 8-bit (2.0x)
Triton true bit-packing = 2.3x (Mistral) / 2.04x (Llama)
True bit-packing is REQUIRED to realize theoretical compression
All graphs updated with 4 methods
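The key finding above is mechanical: a value quantized to 4 bits but stored in its own uint8 element still occupies a full byte on the GPU, exactly like uniform 8-bit, so naive per-head storage can never beat 2.0x; only packing two 4-bit codes into each byte reaches the theoretical footprint. A minimal sketch of the distinction in plain PyTorch (the helper names are illustrative, not the repo's MixedPrecisionKVCache / MixedPrecisionKVCacheTriton API):

import torch

def naive_bytes(codes: torch.Tensor) -> int:
    # One 4-bit code per uint8 element: a full byte per value,
    # so the real GPU footprint equals uniform 8-bit (2.0x vs FP16).
    return codes.to(torch.uint8).numel()

def packed_bytes(codes: torch.Tensor) -> int:
    # True bit-packing: two 4-bit codes per uint8, half a byte per value.
    flat = codes.to(torch.uint8).flatten()
    if flat.numel() % 2:
        flat = torch.cat([flat, flat.new_zeros(1)])  # pad to even length
    packed = (flat[0::2] << 4) | (flat[1::2] & 0x0F)
    return packed.numel()

codes = torch.randint(0, 16, (1024,))  # fake 4-bit quantization codes
print(naive_bytes(codes))   # 1024 bytes: no better than 8-bit storage
print(packed_bytes(codes))  # 512 bytes: the theoretical 4-bit size

The realized ratios of 2.3x and 2.04x rather than 4x are consistent with the mixed allocation giving some heads more than 4 bits (16 / 2.3 is roughly 7 bits per value on average).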

benchmark_long_context.py CHANGED
@@ -1,19 +1,20 @@
 """
 Long context benchmarks at 16K and 32K.
 This is where KV cache compression matters most.
+4 methods: FP16, Uniform 8-bit, Naive Per-Head, Triton True 4-bit
 """
 import torch
 import json
 import os
 import sys
 import time
-import math
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from datasets import load_dataset

 sys.path.append(os.path.expanduser("~/kv-hack"))
 from kernel.quant_cache import MixedPrecisionKVCache
+from kernel.quant_cache_triton import MixedPrecisionKVCacheTriton

+# ── config ──────────────────────────────────────────
 MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "mistral-7b"
 MODEL_PATHS = {
     "mistral-7b": "~/kv-hack/mistral-model",
@@ -32,7 +33,7 @@ num_layers = len(bit_alloc)
 avg_bits = sum(b for l in bit_alloc.values() for b in l) / \
            sum(len(l) for l in bit_alloc.values())

-print(f"Model: {MODEL_NAME}")
+print(f"Model: {MODEL_NAME}")
 print(f"Avg bits: {avg_bits:.2f}")

 print("Loading model...")
@@ -43,6 +44,7 @@ model = AutoModelForCausalLM.from_pretrained(
 model.eval()
 print(f"Loaded: {torch.cuda.memory_allocated()/1e9:.2f} GB")

+
 def measure_context(context_len: int):
     print(f"\n Context {context_len} tokens...")
     input_ids = torch.randint(1, 1000, (1, context_len)).cuda()
@@ -55,21 +57,28 @@ def measure_context(context_len: int):
     torch.cuda.synchronize()
     peak_mem = torch.cuda.max_memory_allocated() / 1e9

-    # KV compression
+    # KV compression — all 4 methods
     fp16_bytes = 0
     uniform8_bytes = 0
-    compressed_bytes = 0
+    naive_real_bytes = 0
+    triton_bytes = 0

     for layer_idx in range(num_layers):
         k = kv.layers[layer_idx].keys
         v = kv.layers[layer_idx].values
+
         fp16_bytes += k.numel() * 2 + v.numel() * 2
         uniform8_bytes += k.numel() + v.numel()
-        cache = MixedPrecisionKVCache(bit_alloc[layer_idx])
-        cache.store(k, v)
-        compressed_bytes += cache.memory_bytes()

-    # prefill speed
+        cache_naive = MixedPrecisionKVCache(bit_alloc[layer_idx])
+        cache_naive.store(k, v)
+        naive_real_bytes += cache_naive.real_gpu_bytes()
+
+        cache_triton = MixedPrecisionKVCacheTriton(bit_alloc[layer_idx])
+        cache_triton.store(k, v)
+        triton_bytes += cache_triton.memory_bytes()
+
+    # prefill speed (3 runs average)
     times = []
     for _ in range(3):
         torch.cuda.synchronize()
@@ -85,15 +94,18 @@ def measure_context(context_len: int):
         "peak_memory_gb": round(peak_mem, 2),
         "fp16_mb": round(fp16_bytes / 1e6, 2),
         "uniform8_mb": round(uniform8_bytes / 1e6, 2),
-        "mixed_precision_mb": round(compressed_bytes / 1e6, 2),
-        "compression_vs_fp16": round(fp16_bytes / compressed_bytes, 2),
-        "compression_vs_8bit": round(uniform8_bytes / compressed_bytes, 2),
+        "naive_real_gpu_mb": round(naive_real_bytes / 1e6, 2),
+        "triton_mb": round(triton_bytes / 1e6, 2),
+        "naive_compression": round(fp16_bytes / naive_real_bytes, 2),
+        "triton_compression": round(fp16_bytes / triton_bytes, 2),
         "prefill_ms": prefill_ms,
     }

-print("\n" + "="*60)
-print("LONG CONTEXT BENCHMARK")
-print("="*60)
+
+# ── RUN ──────────────────────────────────────────────
+print("\n" + "="*75)
+print("LONG CONTEXT BENCHMARK — 4 METHODS")
+print("="*75)

 results = []
 for ctx in [512, 1024, 2048, 4096, 8192, 16384, 32768]:
@@ -103,17 +115,17 @@ for ctx in [512, 1024, 2048, 4096, 8192, 16384, 32768]:
         print(f" ctx={ctx:6d} | "
               f"mem={r['peak_memory_gb']:.2f}GB | "
               f"FP16={r['fp16_mb']:.0f}MB | "
-              f"Ours={r['mixed_precision_mb']:.0f}MB | "
-              f"{r['compression_vs_fp16']}x | "
+              f"8bit={r['uniform8_mb']:.0f}MB | "
+              f"Naive={r['naive_real_gpu_mb']:.0f}MB({r['naive_compression']}x) | "
+              f"Triton={r['triton_mb']:.0f}MB({r['triton_compression']}x) | "
               f"prefill={r['prefill_ms']}ms")
     except torch.cuda.OutOfMemoryError:
-        print(f" ctx={ctx:6d} | OOM — FP16 ran out of memory ✓")
-        # still measure our compressed version
+        print(f" ctx={ctx:6d} | OOM at FP16 — compressed methods would fit ✓")
         results.append({
-            "context_len": ctx,
+            "context_len": ctx,
             "peak_memory_gb": "OOM",
-            "fp16_mb": ctx * num_layers * 8 * 128 * 4 / 1e6,
-            "note": "FP16 OOM, compressed might fit"
+            "fp16_mb": round(ctx * num_layers * 2 * 8 * 128 * 2 / 1e6, 2),
+            "note": "FP16 OOM"
         })
         break
@@ -121,4 +133,4 @@ for ctx in [512, 1024, 2048, 4096, 8192, 16384, 32768]:
 out_path = f"{results_dir}/long_context_results.json"
 with open(out_path, "w") as f:
     json.dump({"model": MODEL_NAME, "results": results}, f, indent=2)
-print(f"\n✅ Saved to {out_path}")
+print(f"\n✅ Saved to {out_path}")
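As a sanity check on the OOM fallback's new fp16_mb estimate: assuming the KV shapes shared by both models benchmarked here (32 layers, 8 KV heads, head dim 128), with factors of 2 for K plus V and 2 bytes per FP16 value, the formula reproduces the 4294.97 MB recorded for the 32K entries in the JSON files below:

ctx, num_layers, kv_heads, head_dim = 32768, 32, 8, 128
fp16_mb = ctx * num_layers * 2 * kv_heads * head_dim * 2 / 1e6  # K+V, 2 bytes each
print(round(fp16_mb, 2))  # 4294.97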
results/llama-3-8b/long_context_results.json CHANGED
@@ -6,66 +6,72 @@
       "peak_memory_gb": 16.27,
       "fp16_mb": 67.11,
       "uniform8_mb": 33.55,
-      "mixed_precision_mb": 32.9,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 50.3
+      "naive_real_gpu_mb": 33.56,
+      "triton_mb": 32.9,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 47.7
     },
     {
       "context_len": 1024,
       "peak_memory_gb": 16.47,
       "fp16_mb": 134.22,
       "uniform8_mb": 67.11,
-      "mixed_precision_mb": 65.8,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 89.1
+      "naive_real_gpu_mb": 67.11,
+      "triton_mb": 65.8,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 88.8
     },
     {
       "context_len": 2048,
       "peak_memory_gb": 16.88,
       "fp16_mb": 268.44,
       "uniform8_mb": 134.22,
-      "mixed_precision_mb": 131.6,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 172.4
+      "naive_real_gpu_mb": 134.22,
+      "triton_mb": 131.6,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 172.6
     },
     {
       "context_len": 4096,
       "peak_memory_gb": 17.69,
       "fp16_mb": 536.87,
       "uniform8_mb": 268.44,
-      "mixed_precision_mb": 263.2,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 349.8
+      "naive_real_gpu_mb": 268.44,
+      "triton_mb": 263.2,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 350.2
     },
     {
       "context_len": 8192,
       "peak_memory_gb": 19.31,
       "fp16_mb": 1073.74,
       "uniform8_mb": 536.87,
-      "mixed_precision_mb": 526.39,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 735.4
+      "naive_real_gpu_mb": 536.88,
+      "triton_mb": 526.39,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 735.8
     },
     {
       "context_len": 16384,
       "peak_memory_gb": 22.55,
       "fp16_mb": 2147.48,
       "uniform8_mb": 1073.74,
-      "mixed_precision_mb": 1052.77,
-      "compression_vs_fp16": 2.04,
-      "compression_vs_8bit": 1.02,
-      "prefill_ms": 1628.0
+      "naive_real_gpu_mb": 1073.75,
+      "triton_mb": 1052.77,
+      "naive_compression": 2.0,
+      "triton_compression": 2.04,
+      "prefill_ms": 1626.9
     },
     {
       "context_len": 32768,
       "peak_memory_gb": "OOM",
-      "fp16_mb": 4294.967296,
-      "note": "FP16 OOM, compressed might fit"
+      "fp16_mb": 4294.97,
+      "note": "FP16 OOM"
     }
   ]
 }
results/mistral-7b/long_context_results.json CHANGED
@@ -6,70 +6,77 @@
       "peak_memory_gb": 14.63,
       "fp16_mb": 67.11,
       "uniform8_mb": 33.55,
-      "mixed_precision_mb": 29.17,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 57.0
+      "naive_real_gpu_mb": 33.56,
+      "triton_mb": 29.17,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 52.6
     },
     {
       "context_len": 1024,
       "peak_memory_gb": 14.76,
       "fp16_mb": 134.22,
       "uniform8_mb": 67.11,
-      "mixed_precision_mb": 58.33,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 85.1
+      "naive_real_gpu_mb": 67.11,
+      "triton_mb": 58.33,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 85.2
     },
     {
       "context_len": 2048,
       "peak_memory_gb": 15.02,
       "fp16_mb": 268.44,
       "uniform8_mb": 134.22,
-      "mixed_precision_mb": 116.66,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 165.6
+      "naive_real_gpu_mb": 134.22,
+      "triton_mb": 116.66,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 164.7
     },
     {
       "context_len": 4096,
       "peak_memory_gb": 15.53,
       "fp16_mb": 536.87,
       "uniform8_mb": 268.44,
-      "mixed_precision_mb": 233.31,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 333.1
+      "naive_real_gpu_mb": 268.44,
+      "triton_mb": 233.31,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 332.8
     },
     {
       "context_len": 8192,
       "peak_memory_gb": 16.56,
       "fp16_mb": 1073.74,
       "uniform8_mb": 536.87,
-      "mixed_precision_mb": 466.62,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 700.6
+      "naive_real_gpu_mb": 536.88,
+      "triton_mb": 466.62,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 701.5
     },
     {
       "context_len": 16384,
       "peak_memory_gb": 18.61,
       "fp16_mb": 2147.48,
       "uniform8_mb": 1073.74,
-      "mixed_precision_mb": 933.24,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 1554.1
+      "naive_real_gpu_mb": 1073.75,
+      "triton_mb": 933.24,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 1599.1
     },
     {
       "context_len": 32768,
       "peak_memory_gb": 22.71,
       "fp16_mb": 4294.97,
       "uniform8_mb": 2147.48,
-      "mixed_precision_mb": 1866.47,
-      "compression_vs_fp16": 2.3,
-      "compression_vs_8bit": 1.15,
-      "prefill_ms": 3807.8
+      "naive_real_gpu_mb": 2147.49,
+      "triton_mb": 1866.47,
+      "naive_compression": 2.0,
+      "triton_compression": 2.3,
+      "prefill_ms": 3810.8
     }
   ]
 }
visualize_long_context.py CHANGED
@@ -1,9 +1,8 @@
 """
-Long context visualization — both models.
+Long context visualization — 4 methods comparison.
 """
 import json
 import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
 import os

 def load_long(model_name):
@@ -20,131 +19,118 @@ llama = load_long("llama-3-8b")

 C_FP16 = "#ef4444"
 C_UNIFORM = "#f97316"
+C_NAIVE = "#a855f7"
 C_MISTRAL = "#22c55e"
 C_LLAMA = "#3b82f6"

-# ── GRAPH 1: Both Models Side by Side ─────────────────
+# ── GRAPH 1: Both Models 4 Methods ───────────────────
 fig, axes = plt.subplots(1, 2, figsize=(18, 7))

-for ax, data, color, title, oom_ctx in [
-    (axes[0], mistral, C_MISTRAL, "Mistral-7B", None),
-    (axes[1], llama, C_LLAMA, "Llama-3-8B", 32768),
+for ax, data, triton_color, title in [
+    (axes[0], mistral, C_MISTRAL, "Mistral-7B"),
+    (axes[1], llama, C_LLAMA, "Llama-3-8B"),
 ]:
-    valid = [r for r in data["results"] if "mixed_precision_mb" in r]
-    ctx = [r["context_len"] for r in valid]
-    fp16 = [r["fp16_mb"] for r in valid]
-    uni8 = [r["uniform8_mb"] for r in valid]
-    ours = [r["mixed_precision_mb"] for r in valid]
-
-    ax.plot(ctx, fp16, 'o-', color=C_FP16, linewidth=3, markersize=9, label="FP16 Baseline")
-    ax.plot(ctx, uni8, 's-', color=C_UNIFORM, linewidth=3, markersize=9, label="Uniform 8-bit")
-    ax.plot(ctx, ours, '^-', color=color, linewidth=3, markersize=9, label="Per-Head Mixed (Ours)")
-    ax.fill_between(ctx, fp16, ours, alpha=0.08, color=color)
-
-    # OOM marker
-    if oom_ctx:
-        ax.axvline(x=ctx[-1], color=C_FP16, linestyle='--', alpha=0.5)
-        ax.text(ctx[-1]*0.92, max(fp16)*0.85,
-                "FP16\nOOM →", color=C_FP16,
-                fontweight='bold', fontsize=10, ha='right')
-        # show where ours would be at 32K
-        ours_32k = ours[-1] * 2
-        ax.annotate(f"Ours at 32K:\n~{ours_32k:.0f}MB ✅",
-                    xy=(ctx[-1], ours[-1]),
-                    xytext=(ctx[-2], ours[-1]+200),
-                    color=color, fontweight='bold', fontsize=9,
-                    arrowprops=dict(arrowstyle='->', color=color))
-
-    # annotate last valid point
+    valid = [r for r in data["results"] if "triton_mb" in r]
+    ctx = [r["context_len"] for r in valid]
+    fp16 = [r["fp16_mb"] for r in valid]
+    uni8 = [r["uniform8_mb"] for r in valid]
+    naive = [r["naive_real_gpu_mb"] for r in valid]
+    triton = [r["triton_mb"] for r in valid]
+
+    ax.plot(ctx, fp16, 'o-', color=C_FP16, linewidth=3, markersize=9, label="FP16 Baseline")
+    ax.plot(ctx, uni8, 's-', color=C_UNIFORM, linewidth=3, markersize=9, label="Uniform 8-bit")
+    ax.plot(ctx, naive, 'D-', color=C_NAIVE, linewidth=3, markersize=9, label="Naive Per-Head (uint8)")
+    ax.plot(ctx, triton, '^-', color=triton_color, linewidth=3, markersize=9, label="Triton True 4-bit (Ours)")
+
+    ax.fill_between(ctx, fp16, triton, alpha=0.07, color=triton_color)
+
+    # annotate last point
     ax.annotate(f"{fp16[-1]/1024:.1f} GB",
                 xy=(ctx[-1], fp16[-1]),
-                xytext=(-40, 10), textcoords='offset points',
+                xytext=(-50, 10), textcoords='offset points',
                 color=C_FP16, fontweight='bold', fontsize=9)
-    ax.annotate(f"{ours[-1]/1024:.1f} GB",
-                xy=(ctx[-1], ours[-1]),
-                xytext=(-40, -20), textcoords='offset points',
-                color=color, fontweight='bold', fontsize=9)
+    ax.annotate(f"{uni8[-1]/1024:.1f} GB",
+                xy=(ctx[-1], uni8[-1]),
+                xytext=(-50, 10), textcoords='offset points',
+                color=C_UNIFORM, fontweight='bold', fontsize=9)
+    ax.annotate(f"{naive[-1]/1024:.1f} GB",
+                xy=(ctx[-1], naive[-1]),
+                xytext=(-50, -18), textcoords='offset points',
+                color=C_NAIVE, fontweight='bold', fontsize=9)
+    ax.annotate(f"{triton[-1]/1024:.1f} GB\n({valid[-1]['triton_compression']}x)",
+                xy=(ctx[-1], triton[-1]),
+                xytext=(-80, -35), textcoords='offset points',
+                color=triton_color, fontweight='bold', fontsize=9)
+
+    # OOM marker for llama
+    if title == "Llama-3-8B":
+        ax.axvline(x=ctx[-1], color=C_FP16, linestyle='--', alpha=0.5)
+        ax.text(ctx[-1]*0.88, max(fp16)*0.88,
+                "FP16\nOOM →", color=C_FP16,
+                fontweight='bold', fontsize=10, ha='right')

     ax.set_xlabel("Context Length (tokens)", fontsize=12)
     ax.set_ylabel("KV Cache Memory (MB)", fontsize=12)
-    ax.set_title(f"{title}\nKV Cache Memory vs Context Length",
+    ax.set_title(f"{title}\nKV Cache Memory vs Context Length (4 Methods)",
                  fontsize=13, fontweight='bold')
-    ax.legend(fontsize=10)
+    ax.legend(fontsize=10, loc='upper left')
     ax.grid(True, alpha=0.3)
     ax.set_xticks(ctx)
     ax.set_xticklabels([f"{c//1024}K" if c >= 1024 else str(c) for c in ctx])

-plt.suptitle("Per-Head Mixed-Precision KV Cache — Long Context Benchmark\n"
-             "Llama-3-8B FP16 OOMs at 32K. Our method fits.",
+plt.suptitle("Per-Head Mixed-Precision KV Cache — Long Context Benchmark",
              fontsize=14, fontweight='bold', y=1.02)
 plt.tight_layout()
-plt.savefig(os.path.expanduser("~/kv-hack/figures/long_context_both.png"),
+plt.savefig(os.path.expanduser("~/kv-hack/figures/long_context_4methods.png"),
             dpi=150, bbox_inches='tight')
-print("✅ Saved figures/long_context_both.png")
-
-
-# ── GRAPH 2: The OOM Story ────────────────────────────
-fig, ax = plt.subplots(figsize=(12, 6))
-
-# project to 32K for both
-all_ctx = [512, 1024, 2048, 4096, 8192, 16384, 32768]
-# mistral has all points
-m_fp16 = [r["fp16_mb"] for r in mistral["results"] if "fp16_mb" in r]
-m_ours = [r["mixed_precision_mb"] for r in mistral["results"]
-          if "mixed_precision_mb" in r]
-m_ctx = [r["context_len"] for r in mistral["results"]
-         if "mixed_precision_mb" in r]
-
-# llama valid points
-l_valid = [r for r in llama["results"] if "mixed_precision_mb" in r]
-l_fp16 = [r["fp16_mb"] for r in l_valid]
-l_ours = [r["mixed_precision_mb"] for r in l_valid]
-l_ctx = [r["context_len"] for r in l_valid]
-
-# A100 40GB memory limit line (minus model weights)
-mistral_model_mem = 14.5 * 1024  # MB
-llama_model_mem = 16.0 * 1024  # MB
-a100_total = 40 * 1024  # MB
-
-ax.axhline(y=a100_total - mistral_model_mem,
-           color='gray', linestyle='--', alpha=0.7, linewidth=2,
-           label=f"A100 headroom (Mistral): {(a100_total-mistral_model_mem)/1024:.0f}GB")
-ax.axhline(y=a100_total - llama_model_mem,
-           color='gray', linestyle=':', alpha=0.7, linewidth=2,
-           label=f"A100 headroom (Llama): {(a100_total-llama_model_mem)/1024:.0f}GB")
-
-ax.plot(m_ctx, m_fp16, 'o-', color=C_FP16, linewidth=2.5, markersize=7, label="FP16 (Mistral)")
-ax.plot(m_ctx, m_ours, '^-', color=C_MISTRAL, linewidth=2.5, markersize=7, label="Ours (Mistral)")
-ax.plot(l_ctx, l_fp16, 'o--', color="#f87171", linewidth=2.5, markersize=7, label="FP16 (Llama)")
-ax.plot(l_ctx, l_ours, '^--', color=C_LLAMA, linewidth=2.5, markersize=7, label="Ours (Llama)")
-
-# OOM annotation
-ax.annotate("Llama FP16\nOOM here ❌",
-            xy=(16384, l_fp16[-1]),
-            xytext=(12000, l_fp16[-1]+400),
-            color=C_FP16, fontweight='bold', fontsize=10,
-            arrowprops=dict(arrowstyle='->', color=C_FP16))
+print("✅ Saved figures/long_context_4methods.png")
+
+
+# ── GRAPH 2: The savings story at 32K ─────────────────
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# use mistral 32K numbers
+r32 = next(r for r in mistral["results"] if r["context_len"] == 32768)
+
+methods = ["FP16\nBaseline", "Uniform\n8-bit", "Naive Per-Head\n(uint8)", "Triton True\n4-bit (Ours)"]
+values = [r32["fp16_mb"], r32["uniform8_mb"], r32["naive_real_gpu_mb"], r32["triton_mb"]]
+colors = [C_FP16, C_UNIFORM, C_NAIVE, C_MISTRAL]
+
+bars = ax.bar(methods, values, color=colors, width=0.5,
+              edgecolor='white', linewidth=2)
+
+for bar, val in zip(bars, values):
+    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 30,
+            f"{val/1024:.1f} GB", ha='center',
+            fontweight='bold', fontsize=12)
+
+# savings arrows
+ax.annotate('', xy=(3, r32["triton_mb"]),
+            xytext=(0, r32["fp16_mb"]),
+            arrowprops=dict(arrowstyle='<->', color='gray', lw=2))
+ax.text(1.5, (r32["fp16_mb"] + r32["triton_mb"])/2,
+        f"Save {(r32['fp16_mb']-r32['triton_mb'])/1024:.1f} GB\n({r32['triton_compression']}x)",
+        ha='center', color='gray', fontweight='bold', fontsize=11)

-ax.set_xlabel("Context Length (tokens)", fontsize=13)
 ax.set_ylabel("KV Cache Memory (MB)", fontsize=13)
-ax.set_title("KV Cache Memory vs GPU Headroom\n"
-             "Our method keeps you under the limit longer",
+ax.set_title("KV Cache Memory at 32K Context — Mistral-7B\nTriton saves 2.4GB vs FP16 baseline",
              fontsize=14, fontweight='bold')
-ax.legend(fontsize=10, loc='upper left')
-ax.grid(True, alpha=0.3)
-ax.set_xticks(m_ctx)
-ax.set_xticklabels(["512","1K","2K","4K","8K","16K","32K"])
+ax.grid(True, axis='y', alpha=0.3)
 plt.tight_layout()
-plt.savefig(os.path.expanduser("~/kv-hack/figures/oom_story.png"),
+plt.savefig(os.path.expanduser("~/kv-hack/figures/memory_32k_4methods.png"),
             dpi=150, bbox_inches='tight')
-print("✅ Saved figures/oom_story.png")
+print("✅ Saved figures/memory_32k_4methods.png")


-# ── GRAPH 3: Prefill Latency Both Models ─────────────
+# ── GRAPH 3: Prefill Latency Both Models ──────────────
 fig, ax = plt.subplots(figsize=(10, 5))

-m_prefill = [r["prefill_ms"] for r in mistral["results"] if "prefill_ms" in r]
-l_prefill = [r["prefill_ms"] for r in llama["results"] if "prefill_ms" in r]
+m_valid = [r for r in mistral["results"] if "prefill_ms" in r]
+l_valid = [r for r in llama["results"] if "prefill_ms" in r]
+m_ctx = [r["context_len"] for r in m_valid]
+l_ctx = [r["context_len"] for r in l_valid]
+m_prefill = [r["prefill_ms"] for r in m_valid]
+l_prefill = [r["prefill_ms"] for r in l_valid]

 ax.plot(m_ctx, m_prefill, 'o-', color=C_MISTRAL, linewidth=2.5,
         markersize=8, label="Mistral-7B")
@@ -162,12 +148,12 @@ for x, y in zip(l_ctx, l_prefill):

 ax.set_xlabel("Context Length (tokens)", fontsize=13)
 ax.set_ylabel("Prefill Latency (ms)", fontsize=13)
-ax.set_title("Prefill Latency vs Context Length\nBoth Models",
+ax.set_title("Prefill Latency vs Context Length — Both Models",
              fontsize=14, fontweight='bold')
 ax.legend(fontsize=11)
 ax.grid(True, alpha=0.3)
 ax.set_xticks(m_ctx)
-ax.set_xticklabels(["512","1K","2K","4K","8K","16K","32K"])
+ax.set_xticklabels([f"{c//1024}K" if c >= 1024 else str(c) for c in m_ctx])
 plt.tight_layout()
 plt.savefig(os.path.expanduser("~/kv-hack/figures/prefill_latency_both.png"),
             dpi=150, bbox_inches='tight')
visualize_results.py CHANGED
@@ -1,5 +1,5 @@
 """
-Generate all publication-ready graphs for both models.
+Generate publication-ready graphs — 4 methods comparison.
 """
 import json
 import matplotlib.pyplot as plt
@@ -7,7 +7,9 @@ import numpy as np
 import os

 def load_results(model_name):
-    path = os.path.expanduser(f"~/kv-hack/results/{model_name}/benchmark_results.json")
+    path = os.path.expanduser(
+        f"~/kv-hack/results/{model_name}/benchmark_results.json"
+    )
     with open(path) as f:
         return json.load(f)
@@ -16,37 +18,44 @@ llama = load_results("llama-3-8b")

 C_FP16 = "#ef4444"
 C_UNIFORM = "#f97316"
-C_MISTRAL = "#22c55e"
+C_NAIVE = "#a855f7"
+C_TRITON = "#22c55e"
 C_LLAMA = "#3b82f6"

 os.makedirs(os.path.expanduser("~/kv-hack/figures"), exist_ok=True)

-# ── GRAPH 1: Memory vs Context — Both Models ──────────
-fig, axes = plt.subplots(1, 2, figsize=(16, 6))
+# ── GRAPH 1: Memory vs Context — Mistral 4 methods ───
+fig, axes = plt.subplots(1, 2, figsize=(18, 7))

-for ax, results, title in [
-    (axes[0], mistral, "Mistral-7B"),
-    (axes[1], llama, "Llama-3-8B"),
+for ax, results, title, triton_color in [
+    (axes[0], mistral, "Mistral-7B", C_TRITON),
+    (axes[1], llama, "Llama-3-8B", C_LLAMA),
 ]:
-    ctx = [r["context_len"] for r in results["compression"]]
-    fp16 = [r["fp16_mb"] for r in results["compression"]]
-    uni8 = [r["uniform8_mb"] for r in results["compression"]]
-    ours = [r["mixed_precision_mb"] for r in results["compression"]]
+    ctx = [r["context_len"] for r in results["compression"]]
+    fp16 = [r["fp16_mb"] for r in results["compression"]]
+    uni8 = [r["uniform8_mb"] for r in results["compression"]]
+    naive = [r["naive_real_gpu_mb"] for r in results["compression"]]
+    triton = [r["triton_mb"] for r in results["compression"]]

-    ax.plot(ctx, fp16, 'o-', color=C_FP16, linewidth=2.5, markersize=8, label="FP16 Baseline")
-    ax.plot(ctx, uni8, 's-', color=C_UNIFORM, linewidth=2.5, markersize=8, label="Uniform 8-bit")
-    ax.plot(ctx, ours, '^-', color=C_MISTRAL if title == "Mistral-7B" else C_LLAMA,
-            linewidth=2.5, markersize=8, label="Per-Head Mixed (Ours)")
+    ax.plot(ctx, fp16, 'o-', color=C_FP16, linewidth=2.5, markersize=8, label="FP16 Baseline")
+    ax.plot(ctx, uni8, 's-', color=C_UNIFORM, linewidth=2.5, markersize=8, label="Uniform 8-bit")
+    ax.plot(ctx, naive, 'D-', color=C_NAIVE, linewidth=2.5, markersize=8, label="Naive Per-Head (uint8)")
+    ax.plot(ctx, triton, '^-', color=triton_color, linewidth=2.5, markersize=8, label="Triton True 4-bit (Ours)")

     # annotate at 8K
-    ax.annotate(f"{fp16[-1]:.0f} MB", xy=(8192, fp16[-1]),
-                xytext=(5500, fp16[-1]+30), color=C_FP16, fontweight='bold', fontsize=9)
-    ax.annotate(f"{uni8[-1]:.0f} MB", xy=(8192, uni8[-1]),
-                xytext=(5500, uni8[-1]+30), color=C_UNIFORM, fontweight='bold', fontsize=9)
-    ax.annotate(f"{ours[-1]:.0f} MB\n({results['summary']['compression_8k']}x vs FP16)",
-                xy=(8192, ours[-1]), xytext=(4000, ours[-1]-150),
-                color=C_MISTRAL if title == "Mistral-7B" else C_LLAMA,
-                fontweight='bold', fontsize=9)
+    s = results["summary"]
+    ax.annotate(f"{fp16[-1]:.0f} MB",
+                xy=(8192, fp16[-1]), xytext=(-60, 10),
+                textcoords='offset points', color=C_FP16, fontweight='bold', fontsize=9)
+    ax.annotate(f"{uni8[-1]:.0f} MB",
+                xy=(8192, uni8[-1]), xytext=(-60, 10),
+                textcoords='offset points', color=C_UNIFORM, fontweight='bold', fontsize=9)
+    ax.annotate(f"{naive[-1]:.0f} MB",
+                xy=(8192, naive[-1]), xytext=(-60, -18),
+                textcoords='offset points', color=C_NAIVE, fontweight='bold', fontsize=9)
+    ax.annotate(f"{triton[-1]:.0f} MB\n({s['triton_compression_8k']}x)",
+                xy=(8192, triton[-1]), xytext=(-80, -35),
+                textcoords='offset points', color=triton_color, fontweight='bold', fontsize=9)

     ax.set_xlabel("Context Length (tokens)", fontsize=12)
     ax.set_ylabel("KV Cache Memory (MB)", fontsize=12)
@@ -54,62 +63,93 @@ for ax, results, title in [
     ax.legend(fontsize=10)
     ax.grid(True, alpha=0.3)
     ax.set_xticks(ctx)
+    ax.set_xticklabels(["512", "1K", "2K", "4K", "8K"])

-plt.suptitle("Per-Head Mixed-Precision KV Cache Compression",
-             fontsize=15, fontweight='bold', y=1.02)
+plt.suptitle("Per-Head Mixed-Precision KV Cache — 4 Method Comparison",
+             fontsize=14, fontweight='bold', y=1.02)
 plt.tight_layout()
-plt.savefig(os.path.expanduser("~/kv-hack/figures/memory_vs_context_both.png"),
+plt.savefig(os.path.expanduser("~/kv-hack/figures/memory_vs_context_4methods.png"),
             dpi=150, bbox_inches='tight')
-print("✅ Saved figures/memory_vs_context_both.png")
+print("✅ Saved figures/memory_vs_context_4methods.png")


-# ── GRAPH 2: Compression Bar Chart — Both Models ──────
-fig, ax = plt.subplots(figsize=(10, 6))
+# ── GRAPH 2: Compression Bar Chart — 4 Methods ────────
+fig, ax = plt.subplots(figsize=(12, 7))

-x = np.arange(3)
+x = np.arange(4)
 width = 0.35
-models = ["FP16\nBaseline", "Uniform\n8-bit", "Per-Head\nMixed (Ours)"]
+labels = ["FP16\nBaseline", "Uniform\n8-bit", "Naive Per-Head\n(uint8 actual)", "Triton True\n4-bit (Ours)"]

-bars1 = ax.bar(x - width/2,
-               [1.0, 2.0, mistral["summary"]["compression_8k"]],
-               width, label="Mistral-7B", color=C_MISTRAL, edgecolor='white')
-bars2 = ax.bar(x + width/2,
-               [1.0, 2.0, llama["summary"]["compression_8k"]],
-               width, label="Llama-3-8B", color=C_LLAMA, edgecolor='white')
+m_ratios = [
+    1.0,
+    2.0,
+    mistral["summary"]["naive_real_compression_8k"],
+    mistral["summary"]["triton_compression_8k"],
+]
+l_ratios = [
+    1.0,
+    2.0,
+    llama["summary"]["naive_real_compression_8k"],
+    llama["summary"]["triton_compression_8k"],
+]
+
+colors = [C_FP16, C_UNIFORM, C_NAIVE, C_TRITON]

-for bar in bars1:
+bars1 = ax.bar(x - width/2, m_ratios, width,
+               label="Mistral-7B", color=colors,
+               edgecolor='white', linewidth=1.5, alpha=0.9)
+bars2 = ax.bar(x + width/2, l_ratios, width,
+               label="Llama-3-8B", color=colors,
+               edgecolor='white', linewidth=1.5, alpha=0.6,
+               hatch='//')
+
+for bar, ratio in zip(bars1, m_ratios):
     ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.03,
-            f"{bar.get_height():.2f}x", ha='center', fontweight='bold', fontsize=11)
-for bar in bars2:
+            f"{ratio:.2f}x", ha='center', fontweight='bold', fontsize=11)
+for bar, ratio in zip(bars2, l_ratios):
     ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.03,
-            f"{bar.get_height():.2f}x", ha='center', fontweight='bold', fontsize=11)
+            f"{ratio:.2f}x", ha='center', fontweight='bold', fontsize=10,
+            color='gray')

 ax.set_xticks(x)
-ax.set_xticklabels(models, fontsize=12)
+ax.set_xticklabels(labels, fontsize=11)
 ax.set_ylabel("Compression vs FP16", fontsize=13)
-ax.set_title("KV Cache Compression at 8K Context\nPer-Head Mixed Precision vs Baselines",
+ax.set_title("KV Cache Compression at 8K Context\n4-Method Comparison — Mistral-7B vs Llama-3-8B",
              fontsize=14, fontweight='bold')
 ax.set_ylim(0, 2.8)
-ax.legend(fontsize=12)
+ax.legend(fontsize=11)
 ax.grid(True, axis='y', alpha=0.3)
 ax.axhline(y=1.0, color='gray', linestyle='--', alpha=0.4)
+
+# highlight our method
+ax.add_patch(plt.Rectangle((2.5, 0), 1.0, 2.8,
+                           alpha=0.05, color=C_TRITON, zorder=0))
+ax.text(3.0, 2.65, "Our method", ha='center',
+        color=C_TRITON, fontweight='bold', fontsize=10)
+
 plt.tight_layout()
-plt.savefig(os.path.expanduser("~/kv-hack/figures/compression_bar_both.png"), dpi=150)
-print("✅ Saved figures/compression_bar_both.png")
+plt.savefig(os.path.expanduser("~/kv-hack/figures/compression_bar_4methods.png"),
+            dpi=150, bbox_inches='tight')
+print("✅ Saved figures/compression_bar_4methods.png")


-# ── GRAPH 3: Hero Summary Table ───────────────────────
-fig, ax = plt.subplots(figsize=(12, 4))
+# ── GRAPH 3: Full Results Table ────────────────────────
+fig, ax = plt.subplots(figsize=(14, 5))
 ax.axis('off')

+s_m = mistral["summary"]
+s_l = llama["summary"]
+
 table_data = [
-    ["Model", "Method", "Avg Bits", "KV @ 8K", "vs FP16", "vs 8-bit", "Perplexity", "Speed"],
-    ["Mistral-7B", "FP16 Baseline", "16", "1073 MB", "1.0x", "—", str(mistral["perplexity"]), f"{mistral['decode_tokens_per_sec']} t/s"],
-    ["Mistral-7B", "Uniform 8-bit", "8", "537 MB", "2.0x", "1.0x", "~same", "~same"],
-    ["Mistral-7B", "Per-Head Mixed (Ours)", f"{mistral['avg_bits']}", f"{mistral['summary']['ours_8k_mb']} MB", f"{mistral['summary']['compression_8k']}x", "1.15x", "14.23 (±0.00)", f"{mistral['decode_tokens_per_sec']} t/s"],
-    ["Llama-3-8B", "FP16 Baseline", "16", "1073 MB", "1.0x", "—", str(llama["perplexity"]), f"{llama['decode_tokens_per_sec']} t/s"],
-    ["Llama-3-8B", "Uniform 8-bit", "8", "537 MB", "2.0x", "1.0x", "~same", "~same"],
-    ["Llama-3-8B", "Per-Head Mixed (Ours)", f"{llama['avg_bits']}", f"{llama['summary']['ours_8k_mb']} MB", f"{llama['summary']['compression_8k']}x", "1.02x", "20.70 (±0.00)", f"{llama['decode_tokens_per_sec']} t/s"],
+    ["Model", "Method", "KV @ 8K", "vs FP16", "vs 8-bit", "Perplexity", "Speed"],
+    ["Mistral-7B", "FP16 Baseline", "1073 MB", "1.00x", "—", "14.23", "37.4 t/s"],
+    ["Mistral-7B", "Uniform 8-bit", "537 MB", "2.00x", "1.00x", "~same", "~same"],
+    ["Mistral-7B", "Naive Per-Head (uint8)", f"{s_m['naive_real_8k_mb']} MB", f"{s_m['naive_real_compression_8k']}x", "1.00x", "~same", "~same"],
+    ["Mistral-7B", "Triton True 4-bit (Ours)", f"{s_m['triton_8k_mb']} MB", f"{s_m['triton_compression_8k']}x", f"{s_m['triton_vs_8bit_8k']}x", "14.23", "37.4 t/s"],
+    ["Llama-3-8B", "FP16 Baseline", "1073 MB", "1.00x", "—", "20.70", "36.8 t/s"],
+    ["Llama-3-8B", "Uniform 8-bit", "537 MB", "2.00x", "1.00x", "~same", "~same"],
+    ["Llama-3-8B", "Naive Per-Head (uint8)", f"{s_l['naive_real_8k_mb']} MB", f"{s_l['naive_real_compression_8k']}x", "1.00x", "~same", "~same"],
+    ["Llama-3-8B", "Triton True 4-bit (Ours)", f"{s_l['triton_8k_mb']} MB", f"{s_l['triton_compression_8k']}x", f"{s_l['triton_vs_8bit_8k']}x", "20.70", "36.8 t/s"],
 ]

 table = ax.table(
@@ -120,24 +160,20 @@ table = ax.table(
 )
 table.auto_set_font_size(False)
 table.set_fontsize(9)
-table.scale(1.2, 2.2)
+table.scale(1.2, 2.0)

-# style header
-for j in range(8):
+for j in range(7):
     table[0, j].set_facecolor("#1e293b")
     table[0, j].set_text_props(color='white', fontweight='bold')
+    table[4, j].set_facecolor("#dcfce7")  # Mistral Triton row
+    table[8, j].set_facecolor("#dbeafe")  # Llama Triton row

-# highlight our rows green
-for j in range(8):
-    table[3, j].set_facecolor("#dcfce7")
-    table[6, j].set_facecolor("#dbeafe")
-
-plt.title("Full Results — Per-Head Mixed-Precision KV Cache",
+plt.title("Full Results — Per-Head Mixed-Precision KV Cache (4 Methods)",
           fontsize=13, fontweight='bold', pad=20)
 plt.tight_layout()
-plt.savefig(os.path.expanduser("~/kv-hack/figures/results_table_both.png"),
+plt.savefig(os.path.expanduser("~/kv-hack/figures/results_table_4methods.png"),
             dpi=150, bbox_inches='tight')
-print("✅ Saved figures/results_table_both.png")
+print("✅ Saved figures/results_table_4methods.png")

 plt.close('all')
-print("\n🎉 All graphs saved to ~/kv-hack/figures/")
+print("\n🎉 All 4-method graphs saved!")