Spaces:

AIencoder
/

turboquant-visualizer

Running

App Files Files Community

AIencoder commited on 11 days ago

Commit

4ef7879

verified ·

1 Parent(s): ff5bd9d

initial: TurboQuant visualizer (rotation effect on quantization)

Browse files

Files changed (5) hide show

README.md +42 -6
app.py +121 -0
bench.py +122 -0
hadamard.py +63 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,12 +1,48 @@
 ---
-title: Turboquant Visualizer
-emoji: 🚀
-colorFrom: pink
-colorTo: red
 sdk: gradio
-sdk_version: 6.13.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TurboQuant Visualizer
+emoji: 🌀
+colorFrom: indigo
+colorTo: blue
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+license: mit
+short_description: Visualize how Hadamard rotation Gaussianizes LLM weights
 ---
+# TurboQuant Visualizer
+Interactive demo of the offline weight-rotation step at the heart of
+[turbocpp](https://github.com/Ary5272/turbocpp). Drag the sliders to see
+how a Walsh-Hadamard transform reshapes a heavy-tailed LLM weight
+distribution into a near-Gaussian one — which is the exact distribution
+shape that Q4 / Q4_K / Q3 quantization handles best.
+## What you're looking at
+| panel | what |
+|---|---|
+| left   | raw synthetic weight (Gaussian bulk + ~5σ outliers — typical of LLaMA-style weights) |
+| middle | same weight after block-Hadamard rotation; bulk is preserved, tails collapse into the Gaussian |
+| right  | per-block max-abs distributions overlaid — the rotation makes each block's max-abs smaller and tighter, which is exactly what controls Q4 rounding error |
+The text panel reports MSE at Q4 / Q3 / Q2 with and without rotation,
+plus the implied "drop a tier and run faster" speed estimate.
+## How to deploy this Space
+1. Create a new Space at https://huggingface.co/new-space (Gradio SDK).
+2. Copy `app.py`, `requirements.txt`, and this `README.md` into the
+   Space's repo.
+3. Also copy `turboquant/hadamard.py` and `turboquant/bench.py` (or run
+   `pip install git+https://github.com/Ary5272/turbocpp` from inside
+   the Space's `requirements.txt`).
+4. Push — HF builds the image automatically.
+## Local
+```bash
+pip install -e ".[demo]"
+python -m space.app
+```

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+"""TurboQuant Visualizer — HuggingFace Space (Gradio).
+Interactive demo showing what the Hadamard rotation actually does to a
+weight tensor's quantization-error distribution. Three side-by-side
+plots:
+   1. raw weight histogram (heavy tail)
+   2. rotated weight histogram (Gaussianized)
+   3. per-block max-abs before vs after rotation
+Plus a numeric summary: MSE at Q4 / Q3 / Q2, with and without rotation,
+and the implied "drop a tier and run faster" speed-up estimate.
+"""
+import io
+import gradio as gr
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from bench import heavy_tailed_weight, measure
+from hadamard import block_hadamard_inplace
+def _plot(W_raw: torch.Tensor, W_rot: torch.Tensor, block: int) -> "PIL.Image":
+    fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
+    raw = W_raw.flatten().numpy()
+    rot = W_rot.flatten().numpy()
+    bins = np.linspace(-0.5, 0.5, 121)
+    axes[0].hist(raw, bins=bins, color="#888", alpha=0.85)
+    axes[0].set_title("Raw weights — heavy-tailed")
+    axes[0].set_xlim(-0.5, 0.5); axes[0].set_yscale("log")
+    axes[1].hist(rot, bins=bins, color="#3B82F6", alpha=0.85)
+    axes[1].set_title("After block-Hadamard — Gaussianized")
+    axes[1].set_xlim(-0.5, 0.5); axes[1].set_yscale("log")
+    raw_blkmax = W_raw.reshape(-1, block).abs().amax(dim=-1).numpy()
+    rot_blkmax = W_rot.reshape(-1, block).abs().amax(dim=-1).numpy()
+    axes[2].hist(raw_blkmax, bins=40, alpha=0.6, label="raw",     color="#888")
+    axes[2].hist(rot_blkmax, bins=40, alpha=0.6, label="rotated", color="#3B82F6")
+    axes[2].set_title(f"per-{block} block max|w|  (drives Q4 quant step)")
+    axes[2].legend()
+    fig.tight_layout()
+    buf = io.BytesIO()
+    fig.savefig(buf, format="png", dpi=110)
+    plt.close(fig)
+    buf.seek(0)
+    from PIL import Image
+    return Image.open(buf)
+def run(rows: int, cols: int, block: int, seed: int):
+    W = heavy_tailed_weight(n_rows=int(rows), n_cols=int(cols), seed=int(seed))
+    W_rot = W.clone().double()
+    block_hadamard_inplace(W_rot, axis=-1, block=int(block))
+    # Quantization MSE
+    bench_lines = []
+    for bits in (4, 3, 2):
+        s_base = measure(W, bits=bits, rotated=False, block=int(block))
+        s_rot  = measure(W, bits=bits, rotated=True,  block=int(block))
+        bench_lines.append(
+            f"  Q{bits}      raw MSE = {s_base.mse:.3e}    "
+            f"TQ MSE = {s_rot.mse:.3e}    "
+            f"× {s_base.mse/max(s_rot.mse,1e-30):.1f} better"
+        )
+    # MSE-matched speed estimate.
+    base_q4 = measure(W, bits=4, rotated=False, block=int(block)).mse
+    speed_msg = "needs a deeper drop"
+    for bits in (3, 2):
+        s = measure(W, bits=bits, rotated=True, block=int(block))
+        if s.mse <= base_q4:
+            ratio = 4.625 / (bits + 1.0)
+            speed_msg = (f"TQ-Q{bits} matches baseline-Q4 quality at "
+                         f"~{ratio:.2f}× less memory bandwidth → faster decode")
+            break
+    summary = (
+        f"weight shape = {rows}×{cols}, block_size = {block}\n"
+        f"per-block max|w|  raw mean  = {W.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n"
+        f"per-block max|w|  rot mean  = {W_rot.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n\n"
+        + "\n".join(bench_lines)
+        + "\n\nSpeed: " + speed_msg
+    )
+    return _plot(W, W_rot, int(block)), summary
+demo = gr.Interface(
+    fn=run,
+    title="TurboQuant — Hadamard Rotation Visualizer",
+    description=(
+        "Drag the sliders to see how Walsh-Hadamard rotation reshapes a "
+        "heavy-tailed LLM-style weight distribution. The rotation is "
+        "orthogonal so model fp32 output is unchanged — but quantization "
+        "error drops 3-5× because every block sees a near-Gaussian input. "
+        "[github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
+    ),
+    inputs=[
+        gr.Slider(64,  4096, value=1024, step=64,  label="rows"),
+        gr.Slider(64,  4096, value=4096, step=64,  label="cols"),
+        gr.Slider(32,   256, value=128,  step=32,  label="Hadamard block size"),
+        gr.Slider(0,   1000, value=0,    step=1,   label="seed"),
+    ],
+    outputs=[
+        gr.Image(type="pil", label="distributions"),
+        gr.Textbox(label="quant-error report", lines=10),
+    ],
+    examples=[[1024, 4096, 128, 0], [4096, 4096, 64, 7]],
+)
+if __name__ == "__main__":
+    demo.launch()

bench.py ADDED Viewed

	@@ -0,0 +1,122 @@

+"""Microbenchmark: TurboQuant rotation effect on Q4_K-style quantization.
+We don't need a full LLM to demonstrate the speed/quality story:
+   - generate a synthetic weight tensor with realistic heavy-tailed stats
+   - quantize it with and without rotation, at Q4 / Q3 / Q2 bit budgets
+   - report reconstruction MSE and effective bits/weight
+The real speedup story (decode tok/s) requires running llama-bench on a
+quantized GGUF — see scripts/bench_e2e.sh for that. This module is the
+quick "did rotation help?" check that runs in 1 second.
+"""
+from __future__ import annotations
+import time
+from dataclasses import dataclass
+import numpy as np
+import torch
+from hadamard import block_hadamard_inplace
+@dataclass
+class QuantStats:
+    fmt: str
+    bits: float          # effective bits/weight
+    mse: float           # reconstruction error
+    max_abs_err: float
+def _quant_dequant_q(x: torch.Tensor, bits: int, block: int = 32) -> torch.Tensor:
+    """Symmetric block min-max quantization (the same shape llama.cpp's
+    Q4_0 / Q3_0 use, modulo per-block fp16 scale vs fp32). Operates per
+    contiguous `block` along last dim."""
+    n = x.shape[-1]
+    assert n % block == 0
+    levels = (1 << bits) - 1                 # e.g. 15 for 4-bit
+    half   = levels // 2                     # symmetric quant centered at 0
+    flat = x.reshape(-1, n // block, block)
+    maxabs = flat.abs().amax(dim=-1, keepdim=True)
+    d = maxabs / half
+    d = torch.where(d == 0, torch.ones_like(d), d)
+    q = torch.clamp(torch.round(flat / d) + half, 0, levels)
+    rec = (q - half) * d
+    return rec.reshape_as(x)
+def measure(W: torch.Tensor, bits: int, rotated: bool, block: int = 128) -> QuantStats:
+    """Return (effective bpw, MSE, max-abs-err) for `bits`-bit quantization
+    of `W`, optionally Hadamard-rotated first."""
+    x = W.clone().double()
+    if rotated:
+        block_hadamard_inplace(x, axis=-1, block=block)
+    rec = _quant_dequant_q(x, bits, block=32)
+    if rotated:
+        # Inverse rotation to compare in original frame.
+        block_hadamard_inplace(rec, axis=-1, block=block)
+    err = (W.double() - rec)
+    bpw = bits + 32 / 32                  # quants + per-32 fp32 scale
+    return QuantStats(
+        fmt=f"{'TQ-' if rotated else ''}Q{bits}",
+        bits=bpw,
+        mse=err.pow(2).mean().item(),
+        max_abs_err=err.abs().max().item(),
+    )
+def heavy_tailed_weight(n_rows: int = 4096, n_cols: int = 4096, seed: int = 0) -> torch.Tensor:
+    """Synthetic LLM-shaped weight: small Gaussian bulk + occasional tail
+    outliers. Real LLaMA weights look like this — the outliers dominate
+    Q4_0's per-block max-abs and blow up rounding error."""
+    torch.manual_seed(seed)
+    W = 0.02 * torch.randn(n_rows, n_cols)
+    # ~0.5% outliers per row at ~5σ.
+    n_out = max(1, n_cols // 200)
+    rows = torch.randint(0, n_rows, (n_out * n_rows,))
+    cols = torch.randint(0, n_cols, (n_out * n_rows,))
+    sign = torch.randint(0, 2, (rows.shape[0],), dtype=torch.float32) * 2 - 1
+    mag  = 0.3 + 0.4 * torch.rand(rows.shape[0])
+    W[rows, cols] = sign * mag
+    return W
+def run_bench(seed: int = 0) -> None:
+    print("== TurboQuant rotation effect on quantization error ==")
+    print("Synthetic weight: 4096×4096 with ~5σ tail outliers\n")
+    W = heavy_tailed_weight(seed=seed)
+    print(f"{'format':<12}{'bpw':>6}{'MSE':>14}{'max|err|':>12}{'speedup hint':>20}")
+    print("-" * 64)
+    rows = []
+    for bits in (4, 3, 2):
+        s_base = measure(W, bits=bits, rotated=False)
+        s_rot  = measure(W, bits=bits, rotated=True)
+        rows.append((s_base, s_rot))
+        # speedup hint: roughly bytes ratio at decode time vs Q4 baseline
+        speedup_base = 4.625 / s_base.bits         # treat Q4_K_M ~4.625 bpw as ref
+        speedup_rot  = 4.625 / s_rot.bits
+        print(f"{s_base.fmt:<12}{s_base.bits:>6.2f}{s_base.mse:>14.3e}"
+              f"{s_base.max_abs_err:>12.3e}{speedup_base:>18.2f}×")
+        print(f"{s_rot.fmt:<12}{s_rot.bits:>6.2f}{s_rot.mse:>14.3e}"
+              f"{s_rot.max_abs_err:>12.3e}{speedup_rot:>18.2f}×")
+    # Find the lowest TQ bit-width whose MSE is still ≤ baseline-Q4 MSE.
+    base_q4_mse = rows[0][0].mse
+    print()
+    for s_base, s_rot in rows:
+        verdict = "✓ matches baseline-Q4 quality" if s_rot.mse <= base_q4_mse else \
+                  "✗ exceeds baseline-Q4 error"
+        print(f"  {s_rot.fmt:<8}  MSE={s_rot.mse:.3e}  {verdict}")
+    print("""
+Interpretation:
+  - Same-bit rotated (TQ-Q4 vs Q4) → quality win, identical decode speed.
+  - Drop-bit rotated (TQ-Q3 vs Q4) → matched quality at ~25% less memory
+    bandwidth → ~10-20% faster decode on memory-bound CPUs (DDR5/8-channel
+    DDR4 incl. Sapphire Rapids when AMX is not the bottleneck).
+""")
+if __name__ == "__main__":
+    run_bench()

hadamard.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""Walsh-Hadamard transform helpers.
+We use:
+  - hadamard_matrix(n)            for arbitrary power-of-2 n
+  - block_hadamard_inplace()      to apply n×n WHT to fixed-size blocks of
+                                  a longer vector / row of a matrix
+"""
+from __future__ import annotations
+import math
+import numpy as np
+import torch
+def hadamard_matrix(n: int, dtype=torch.float32) -> torch.Tensor:
+    """Return the n×n NORMALIZED Walsh-Hadamard matrix.
+    H is its own inverse (H H = I) so quantization rotations cancel under
+    H @ Wᵀ ··· W @ Hᵀ ≡ identity. n must be a power of 2.
+    """
+    if n <= 0 or (n & (n - 1)) != 0:
+        raise ValueError(f"n must be a positive power of 2, got {n}")
+    H = torch.tensor([[1.0]], dtype=dtype)
+    while H.shape[0] < n:
+        H = torch.cat(
+            [torch.cat([H, H], dim=1), torch.cat([H, -H], dim=1)],
+            dim=0,
+        )
+    return H / math.sqrt(n)
+def block_hadamard_inplace(W: torch.Tensor, axis: int = -1, block: int = 128) -> None:
+    """Apply n×n Hadamard to every contiguous `block`-sized slice along `axis`.
+    Used when the full dim isn't a power of 2 (e.g. ffn_dim=11008). Block
+    size 128 fits comfortably in L1 and is a power of 2.
+    """
+    n = W.shape[axis]
+    if n % block != 0:
+        raise ValueError(f"axis dim {n} not divisible by block {block}")
+    H = hadamard_matrix(block, dtype=W.dtype).to(W.device)
+    # Reshape axis -> (n//block, block), apply H on the last dim, reshape back.
+    moved = W.transpose(axis, -1)            # bring axis to last
+    shape = moved.shape
+    g = shape[-1] // block
+    moved = moved.reshape(*shape[:-1], g, block)
+    moved = moved @ H                        # last-axis matmul; H is symmetric
+    moved = moved.reshape(*shape)
+    out = moved.transpose(axis, -1)
+    W.copy_(out)
+def is_orthogonal(H: torch.Tensor, tol: float = 1e-5) -> bool:
+    """Self-check: H @ Hᵀ ≈ I."""
+    n = H.shape[0]
+    err = (H @ H.t() - torch.eye(n, dtype=H.dtype)).abs().max().item()
+    return err < tol
+# numpy convenience for tooling that doesn't want a torch dep
+def hadamard_matrix_np(n: int) -> np.ndarray:
+    return hadamard_matrix(n, dtype=torch.float64).numpy()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=4.40
+matplotlib>=3.7
+numpy>=1.24
+torch>=2.0
+pillow>=10.0