AIencoder committed · verified
Commit: a61623a · 1 Parent(s): 48cba22

v8: api_name= on click handlers + background model preload + status indicator
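The `api_name=` handlers mean each button now doubles as a named API endpoint. A minimal sketch of driving them from the Gradio Python client — the Space id is taken from the app's footer markdown, and `pip install gradio_client` is assumed:

```python
from gradio_client import Client

client = Client("AIencoder/turboquant-visualizer")  # Space id from the app footer

# /generate wraps chat(prompt, max_tokens, temperature) and returns
# (output text, stats markdown), matching the two output components.
text, stats = client.predict(
    "Explain quantization in one paragraph.",  # prompt
    96,                                        # max new tokens
    0.7,                                       # temperature
    api_name="/generate",
)
print(stats)

# /status polls the background loader while the GGUF is still downloading.
print(client.predict(api_name="/status"))
```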

Files changed (1)
  1. app.py +100 -56
app.py CHANGED
@@ -3,13 +3,17 @@
 Two tabs:
 1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M via the
    prebuilt llama-cpp-python wheel from AIencoder/llama-cpp-wheels.
-2. TurboQuant math viz: shows what the offline rotation does to the
-   weight distribution that quantization sees.
+2. TurboQuant math viz: shows what offline rotation does to the weight
+   distribution that quantization sees.
+
+Background-loaded model + named API endpoint so curl / requests / the
+Gradio Python client all work.
 """
 from __future__ import annotations
 
 import io
 import os
+import threading
 import time
 
 import gradio as gr
@@ -24,44 +28,70 @@ from hadamard import block_hadamard_inplace
 from bench import heavy_tailed_weight, measure
 
 
-_llm = None
-_load_error = None
-
 MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
 MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
 
+# ─── Background model loader ─────────────────────────────────────────────────
+# Loading the GGUF is slow (~60s download + ~5s mmap on a free Space) and
+# the first chat() call would otherwise time out the Gradio request. We
+# kick the load on a daemon thread at import time and pin the result in
+# module globals; chat() blocks briefly on that.
+_llm = None
+_llm_error: str | None = None
+_llm_status = "loading: starting up"
 
-def _ensure_llm():
-    global _llm, _load_error
-    if _llm is not None:
-        return _llm, None
-    if _load_error is not None:
-        return None, _load_error
+
+def _load_model():
+    global _llm, _llm_error, _llm_status
     try:
+        _llm_status = f"loading: downloading {MODEL_FILE}"
         from huggingface_hub import hf_hub_download
-        from llama_cpp import Llama
         path = hf_hub_download(
             repo_id=MODEL_REPO,
             filename=MODEL_FILE,
             cache_dir=os.environ.get("HF_HOME", "/tmp/hf"),
         )
-        _llm = Llama(
+        _llm_status = "loading: instantiating llama-cpp"
+        from llama_cpp import Llama
+        llm = Llama(
             model_path=path,
             n_ctx=2048,
             n_threads=int(os.environ.get("LLAMA_THREADS", "2")),
             n_batch=64,
             verbose=False,
         )
-        return _llm, None
+        _llm = llm
+        _llm_status = "ready"
     except Exception as e:
-        _load_error = f"failed to load model: {e}"
-        return None, _load_error
+        _llm_error = f"{type(e).__name__}: {e}"
+        _llm_status = f"failed: {_llm_error}"
 
 
+threading.Thread(target=_load_model, daemon=True).start()
+
+
+def _await_llm(timeout: float = 240.0):
+    """Block until the model is loaded (or loading fails). Raises
+    RuntimeError with a meaningful message if loading failed."""
+    t0 = time.time()
+    while _llm is None and _llm_error is None:
+        if time.time() - t0 > timeout:
+            raise RuntimeError(f"timeout after {timeout:.0f}s; status: {_llm_status}")
+        time.sleep(0.5)
+    if _llm_error:
+        raise RuntimeError(_llm_error)
+    return _llm
+
+
+# ─── Inference ───────────────────────────────────────────────────────────────
 def chat(prompt: str, max_tokens: int, temperature: float):
-    llm, err = _ensure_llm()
-    if err:
-        return f"Loading error: {err}", ""
+    """Return (output, stats markdown). The wrapped error path keeps the UI
+    responsive even when the model is mid-download."""
+    try:
+        llm = _await_llm()
+    except RuntimeError as e:
+        return f"⏳ model not ready: {e}", f"status: **{_llm_status}**"
+
     formatted = (
         f"<|system|>\nYou are a concise assistant.</s>\n"
         f"<|user|>\n{prompt}</s>\n"
@@ -77,39 +107,37 @@ def chat(prompt: str, max_tokens: int, temperature: float):
         echo=False,
     )
     dt = time.time() - t0
-    text = out["choices"][0]["text"].strip()
+    text = out["choices"][0]["text"].strip() or "(empty)"
     n = out["usage"]["completion_tokens"]
-    tps = n / max(dt, 1e-3)
     stats = (
-        f"**{n} tokens** in **{dt:.2f}s** -> **{tps:.1f} tok/s**\n\n"
-        f"This is baseline Q4_K_M. With TurboQuant rotation you can drop "
-        f"to Q3_K_M at similar quality and pick up ~25% more tok/s on the "
-        f"same hardware (math in the next tab)."
+        f"**{n} tokens** in **{dt:.2f}s** → **{n/max(dt,1e-3):.1f} tok/s**\n\n"
+        f"Q4_K_M baseline. With TurboQuant rotation you can drop to Q3_K_M "
+        f"at similar quality and pick up ~25% more tok/s on the same hardware "
+        f"(math in the next tab)."
     )
-    return text or "(empty)", stats
+    return text, stats
+
+
+def status_check():
+    return f"model status: **{_llm_status}**"
 
 
+# ─── Visualization ───────────────────────────────────────────────────────────
 def _plot(W_raw, W_rot, block):
     fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
-    raw = W_raw.flatten().numpy()
-    rot = W_rot.flatten().numpy()
-
     bins = np.linspace(-0.5, 0.5, 121)
-    axes[0].hist(raw, bins=bins, color="#888", alpha=0.85)
-    axes[0].set_title("raw weights - heavy-tailed")
+    axes[0].hist(W_raw.flatten().numpy(), bins=bins, color="#888", alpha=0.85)
+    axes[0].set_title("raw weights — heavy-tailed")
     axes[0].set_xlim(-0.5, 0.5); axes[0].set_yscale("log")
-
-    axes[1].hist(rot, bins=bins, color="#3B82F6", alpha=0.85)
-    axes[1].set_title("after block-Hadamard - Gaussianized")
+    axes[1].hist(W_rot.flatten().numpy(), bins=bins, color="#3B82F6", alpha=0.85)
+    axes[1].set_title("after block-Hadamard — Gaussianized")
     axes[1].set_xlim(-0.5, 0.5); axes[1].set_yscale("log")
-
     raw_blkmax = W_raw.reshape(-1, block).abs().amax(dim=-1).numpy()
     rot_blkmax = W_rot.reshape(-1, block).abs().amax(dim=-1).numpy()
    axes[2].hist(raw_blkmax, bins=40, alpha=0.6, label="raw", color="#888")
    axes[2].hist(rot_blkmax, bins=40, alpha=0.6, label="rotated", color="#3B82F6")
    axes[2].set_title(f"per-{block} block max|w|")
    axes[2].legend()
-
     fig.tight_layout()
     buf = io.BytesIO()
     fig.savefig(buf, format="png", dpi=110)
@@ -122,7 +150,6 @@ def visualize(rows, cols, block, seed):
     W = heavy_tailed_weight(int(rows), int(cols), int(seed))
     W_rot = W.clone().double()
     block_hadamard_inplace(W_rot, axis=-1, block=int(block))
-
     lines = []
     for bits in (4, 3, 2):
         s_base = measure(W, bits=bits, rotated=False, block=int(block))
@@ -130,11 +157,10 @@ def visualize(rows, cols, block, seed):
         lines.append(
             f"Q{bits} raw MSE = {s_base.mse:.3e} "
             f"TQ MSE = {s_rot.mse:.3e} "
-            f"x {s_base.mse/max(s_rot.mse,1e-30):.1f} better"
+            f"× {s_base.mse/max(s_rot.mse,1e-30):.1f} better"
         )
-
     summary = (
-        f"weight = {rows} x {cols}, block = {block}\n"
+        f"weight = {rows} × {cols}, block = {block}\n"
         f"per-block max|w| raw mean = "
         f"{W.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n"
         f"per-block max|w| rot mean = "
@@ -144,20 +170,25 @@ def visualize(rows, cols, block, seed):
144
  return _plot(W, W_rot, int(block)), summary
145
 
146
 
147
- with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
148
- theme=gr.themes.Soft()) as demo:
149
- gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
150
  gr.Markdown(
151
  "Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) via a "
152
- "prebuilt wheel + interactive Hadamard-rotation visualizer. "
153
- "Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
 
 
 
154
  )
155
 
156
  with gr.Tab("Run inference"):
157
  gr.Markdown(
158
  "Live llama.cpp inference on TinyLlama-1.1B-Chat at Q4_K_M, "
159
- "loaded via `llama-cpp-python` on this Space's CPU."
 
160
  )
 
161
  prompt_in = gr.Textbox(
162
  value="Explain quantization in one paragraph.",
163
  label="prompt", lines=3,
@@ -165,17 +196,25 @@ with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
         with gr.Row():
             max_t = gr.Slider(8, 256, value=96, step=8, label="max new tokens")
             temp = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="temperature")
-        run_btn = gr.Button("generate", variant="primary")
+        with gr.Row():
+            run_btn = gr.Button("generate", variant="primary")
+            status_btn = gr.Button("refresh model status", variant="secondary")
         out_box = gr.Textbox(label="output", lines=10)
         stats_box = gr.Markdown()
-        run_btn.click(chat, [prompt_in, max_t, temp], [out_box, stats_box])
+        run_btn.click(
+            chat,
+            inputs=[prompt_in, max_t, temp],
+            outputs=[out_box, stats_box],
+            api_name="generate",  # ← exposed as named API endpoint
+        )
+        status_btn.click(status_check, outputs=status_md, api_name="status")
 
     with gr.Tab("TurboQuant math viz"):
         gr.Markdown(
             "Drag the sliders to see how a Walsh-Hadamard rotation "
             "reshapes a synthetic LLM-style weight distribution. The "
-            "rotation is orthogonal - fp32 model output is unchanged - "
-            "but per-block max-abs drops 3-5x -> much smaller Q4 / Q4_K "
+            "rotation is orthogonal — fp32 model output is unchanged — "
+            "but per-block max-abs drops 3-5× → much smaller Q4 / Q4_K "
             "rounding error."
         )
         with gr.Row():
@@ -183,19 +222,24 @@ with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
             cols = gr.Slider(64, 4096, value=4096, step=64, label="cols")
             block = gr.Slider(32, 256, value=128, step=32, label="block size")
             seed = gr.Slider(0, 1000, value=0, step=1, label="seed")
-        viz_btn = gr.Button("visualize")
+        viz_btn = gr.Button("visualize", variant="primary")
         img_out = gr.Image(type="pil", label="distributions")
         rep_out = gr.Textbox(label="quant-error report", lines=8)
-        viz_btn.click(visualize, [rows, cols, block, seed], [img_out, rep_out])
+        viz_btn.click(
+            visualize,
+            inputs=[rows, cols, block, seed],
+            outputs=[img_out, rep_out],
+            api_name="visualize",
+        )
         demo.load(visualize, [rows, cols, block, seed], [img_out, rep_out])
 
     gr.Markdown(
         "---\n"
-        "Want the actual A/B speed numbers? Clone the repo and run "
-        "`scripts/bench_e2e.sh /path/to/HF/Llama-3-8B`, or pull the Docker "
-        "image: `docker pull ghcr.io/ary5272/turbocpp:turboquant`."
+        "**API**: every button is a named endpoint — POST to "
+        "`/api/generate`, `/api/visualize`, `/api/status`. Or use "
+        "`gradio_client.Client('AIencoder/turboquant-visualizer').predict(...)`."
    )
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=8).launch()
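The math-viz tab's claim — an orthogonal rotation leaves the fp32 output unchanged but shrinks per-block max|w| — is easy to sanity-check without the repo's `hadamard` / `bench` helpers. A self-contained sketch using `scipy.linalg.hadamard` as a stand-in for `block_hadamard_inplace` (the Student-t weight distribution is an assumption about what `heavy_tailed_weight` produces):

```python
import numpy as np
from scipy.linalg import hadamard

block = 128
rng = np.random.default_rng(0)
# Heavy-tailed stand-in for an LLM-style weight matrix (assumed shape of
# the distribution; the repo's heavy_tailed_weight may differ).
W = rng.standard_t(df=3, size=(1024, 4096)) * 0.02

H = hadamard(block) / np.sqrt(block)          # orthogonal: H @ H.T == I
W_rot = (W.reshape(-1, block) @ H).reshape(W.shape)

def mean_block_max(M):
    return np.abs(M.reshape(-1, block)).max(axis=1).mean()

print(f"raw per-block max|w| mean: {mean_block_max(W):.4f}")
print(f"rot per-block max|w| mean: {mean_block_max(W_rot):.4f}")  # typically 3-5× smaller
# Because H is orthogonal, applying the matching inverse rotation at load
# time restores the fp32 matmul result up to float rounding.
```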
 
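For completeness, the raw-HTTP route the new footer advertises. This is a hedged sketch: the `{"data": [...]}` payload shape is the Gradio 3.x convention for named endpoints (Gradio 4.x uses a two-step `/call/generate` flow instead), and the `*.hf.space` hostname is inferred from the Space id — the Space's own "Use via API" page is authoritative.

```python
import requests

# Hostname assumed from the Space id in the footer (owner-space.hf.space).
BASE = "https://aiencoder-turboquant-visualizer.hf.space"

resp = requests.post(
    f"{BASE}/api/generate",
    json={"data": ["Explain quantization in one paragraph.", 96, 0.7]},
    timeout=300,  # first call may block while the GGUF downloads
)
resp.raise_for_status()
text, stats = resp.json()["data"]
print(text)
```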