| """TurboCPP β llama.cpp + TurboQuant β HuggingFace Space. | |
| Two tabs: | |
| 1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M via the | |
| prebuilt llama-cpp-python wheel from AIencoder/llama-cpp-wheels. | |
| 2. TurboQuant math viz: shows what offline rotation does to the weight | |
| distribution that quantization sees. | |
| Background-loaded model + named API endpoint so curl / requests / the | |
| Gradio Python client all work. | |
| """ | |
from __future__ import annotations

import io
import os
import threading
import time

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # headless backend; the Space has no display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

from hadamard import block_hadamard_inplace
from bench import heavy_tailed_weight, measure

MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
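
# The Q4_K_M GGUF is ~0.67 GB (assumption: size from TheBloke's repo listing),
# small enough to download and mmap on a free CPU Space.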

# ─── Background model loader ─────────────────────────────────────────────────
# Loading the GGUF is slow (~60 s download + ~5 s mmap on a free Space) and
# the first chat() call would otherwise time out the Gradio request. We kick
# off the load on a daemon thread at import time and pin the result in module
# globals; chat() blocks briefly on that.
_llm = None
_llm_error: str | None = None
_llm_status = "loading: starting up"


def _load_model():
    global _llm, _llm_error, _llm_status
    try:
        _llm_status = f"loading: downloading {MODEL_FILE}"
        from huggingface_hub import hf_hub_download

        path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=os.environ.get("HF_HOME", "/tmp/hf"),
        )
        _llm_status = "loading: instantiating llama-cpp"
        from llama_cpp import Llama

        llm = Llama(
            model_path=path,
            n_ctx=2048,
            n_threads=int(os.environ.get("LLAMA_THREADS", "2")),
            n_batch=64,
            verbose=False,
        )
        _llm = llm
        _llm_status = "ready"
    except Exception as e:
        _llm_error = f"{type(e).__name__}: {e}"
        _llm_status = f"failed: {_llm_error}"
threading.Thread(target=_load_model, daemon=True).start()


def _await_llm(timeout: float = 240.0):
    """Block until the model is loaded, polling every 0.5 s. Raises
    RuntimeError with a meaningful message if loading failed or timed out."""
    t0 = time.time()
    while _llm is None and _llm_error is None:
        if time.time() - t0 > timeout:
            raise RuntimeError(f"timeout after {timeout:.0f}s; status: {_llm_status}")
        time.sleep(0.5)
    if _llm_error:
        raise RuntimeError(_llm_error)
    return _llm


# ─── Inference ───────────────────────────────────────────────────────────────
def chat(prompt: str, max_tokens: int, temperature: float):
    """Return (output, stats markdown). The wrapped error path keeps the UI
    responsive even when the model is mid-download."""
    try:
        llm = _await_llm()
    except RuntimeError as e:
        return f"⏳ model not ready: {e}", f"status: **{_llm_status}**"
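    # TinyLlama-1.1B-Chat-v1.0 uses the Zephyr-style chat template: each turn
    # is "<|role|>\n...</s>", and generation continues after "<|assistant|>\n".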
    formatted = (
        "<|system|>\nYou are a concise assistant.</s>\n"
        f"<|user|>\n{prompt}</s>\n"
        "<|assistant|>\n"
    )
    t0 = time.time()
    out = llm(
        formatted,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=0.95,
        stop=["</s>", "<|user|>"],
        echo=False,
    )
    dt = time.time() - t0
    text = out["choices"][0]["text"].strip() or "(empty)"
    n = out["usage"]["completion_tokens"]
    stats = (
        f"**{n} tokens** in **{dt:.2f}s** → **{n / max(dt, 1e-3):.1f} tok/s**\n\n"
        "Q4_K_M baseline. With TurboQuant rotation you can drop to Q3_K_M "
        "at similar quality and pick up ~25% more tok/s on the same hardware "
        "(math in the next tab)."
    )
    return text, stats


def status_check():
    return f"model status: **{_llm_status}**"


# ─── Visualization ───────────────────────────────────────────────────────────
def _plot(W_raw, W_rot, block):
    fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
    bins = np.linspace(-0.5, 0.5, 121)
    axes[0].hist(W_raw.flatten().numpy(), bins=bins, color="#888", alpha=0.85)
    axes[0].set_title("raw weights – heavy-tailed")
    axes[0].set_xlim(-0.5, 0.5)
    axes[0].set_yscale("log")  # log y keeps the tails visible
    axes[1].hist(W_rot.flatten().numpy(), bins=bins, color="#3B82F6", alpha=0.85)
    axes[1].set_title("after block-Hadamard – Gaussianized")
    axes[1].set_xlim(-0.5, 0.5)
    axes[1].set_yscale("log")
    raw_blkmax = W_raw.reshape(-1, block).abs().amax(dim=-1).numpy()
    rot_blkmax = W_rot.reshape(-1, block).abs().amax(dim=-1).numpy()
    axes[2].hist(raw_blkmax, bins=40, alpha=0.6, label="raw", color="#888")
    axes[2].hist(rot_blkmax, bins=40, alpha=0.6, label="rotated", color="#3B82F6")
    axes[2].set_title(f"per-{block} block max|w|")
    axes[2].legend()
    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format="png", dpi=110)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


def visualize(rows, cols, block, seed):
    W = heavy_tailed_weight(int(rows), int(cols), int(seed))
    W_rot = W.clone().double()
    block_hadamard_inplace(W_rot, axis=-1, block=int(block))
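    # The in-place transform multiplies each length-`block` slice along the
    # last axis by the normalized Hadamard matrix H/sqrt(block); the map is
    # orthogonal, so in fp32 the model's outputs are unchanged.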
    lines = []
    for bits in (4, 3, 2):
        s_base = measure(W, bits=bits, rotated=False, block=int(block))
        s_rot = measure(W, bits=bits, rotated=True, block=int(block))
        lines.append(
            f"Q{bits} raw MSE = {s_base.mse:.3e} "
            f"TQ MSE = {s_rot.mse:.3e} "
            f"{s_base.mse / max(s_rot.mse, 1e-30):.1f}× better"
        )
    summary = (
        f"weight = {rows} × {cols}, block = {block}\n"
        f"per-block max|w| raw mean = "
        f"{W.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n"
        f"per-block max|w| rot mean = "
        f"{W_rot.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n\n"
        + "\n".join(lines)
    )
    return _plot(W, W_rot, int(block)), summary
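

# For reference, the same rotation as a dense matrix product (a minimal
# sketch, not hadamard.py's in-place kernel; assumes `block` is a power of
# two, which scipy.linalg.hadamard requires):
#
#   import scipy.linalg
#   def block_hadamard_dense(W: np.ndarray, block: int) -> np.ndarray:
#       H = scipy.linalg.hadamard(block) / np.sqrt(block)  # orthogonal: H @ H.T = I
#       return (W.reshape(-1, block) @ H).reshape(W.shape)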


# ─── UI ──────────────────────────────────────────────────────────────────────
with gr.Blocks(title="turbocpp – llama.cpp + TurboQuant") as demo:
    gr.Markdown("# turbocpp – llama.cpp + TurboQuant")
    gr.Markdown(
        "Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) via a "
        "prebuilt wheel + an interactive Hadamard-rotation visualizer.\n\n"
        "**Code:** [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp) · "
        "**Wheel:** `pip install turbocpp` from "
        "[the dataset mirror](https://huggingface.co/datasets/AIencoder/llama-cpp-wheels) · "
        "**Docker:** `ghcr.io/ary5272/turbocpp:cpu`"
    )
| with gr.Tab("Run inference"): | |
| gr.Markdown( | |
| "Live llama.cpp inference on TinyLlama-1.1B-Chat at Q4_K_M, " | |
| "loaded via `llama-cpp-python` on this Space's CPU.\n\n" | |
| "*First call may wait ~60s while the GGUF downloads.*" | |
| ) | |
| status_md = gr.Markdown(f"model status: **{_llm_status}**") | |
| prompt_in = gr.Textbox( | |
| value="Explain quantization in one paragraph.", | |
| label="prompt", lines=3, | |
| ) | |
| with gr.Row(): | |
| max_t = gr.Slider(8, 256, value=96, step=8, label="max new tokens") | |
| temp = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="temperature") | |
| with gr.Row(): | |
| run_btn = gr.Button("generate", variant="primary") | |
| status_btn = gr.Button("refresh model status", variant="secondary") | |
| out_box = gr.Textbox(label="output", lines=10) | |
| stats_box = gr.Markdown() | |
| run_btn.click( | |
| chat, | |
| inputs=[prompt_in, max_t, temp], | |
| outputs=[out_box, stats_box], | |
| api_name="generate", # β exposed as named API endpoint | |
| ) | |
| status_btn.click(status_check, outputs=status_md, api_name="status") | |
| with gr.Tab("TurboQuant math viz"): | |
| gr.Markdown( | |
| "Drag the sliders to see how a Walsh-Hadamard rotation " | |
| "reshapes a synthetic LLM-style weight distribution. The " | |
| "rotation is orthogonal β fp32 model output is unchanged β " | |
| "but per-block max-abs drops 3-5Γ β much smaller Q4 / Q4_K " | |
| "rounding error." | |
| ) | |
| with gr.Row(): | |
| rows = gr.Slider(64, 4096, value=1024, step=64, label="rows") | |
| cols = gr.Slider(64, 4096, value=4096, step=64, label="cols") | |
| block = gr.Slider(32, 256, value=128, step=32, label="block size") | |
| seed = gr.Slider(0, 1000, value=0, step=1, label="seed") | |
| viz_btn = gr.Button("visualize", variant="primary") | |
| img_out = gr.Image(type="pil", label="distributions") | |
| rep_out = gr.Textbox(label="quant-error report", lines=8) | |
| viz_btn.click( | |
| visualize, | |
| inputs=[rows, cols, block, seed], | |
| outputs=[img_out, rep_out], | |
| api_name="visualize", | |
| ) | |
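        # Pre-render the visualization on page open so the tab never starts
        # empty (demo.load below runs once per new browser session).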
        demo.load(visualize, [rows, cols, block, seed], [img_out, rep_out])
    gr.Markdown(
        "---\n"
        "### Use the API\n"
        "**Python (recommended):**\n"
        "```python\n"
        "from gradio_client import Client\n"
        "c = Client('AIencoder/turboquant-visualizer')\n"
        "out, stats = c.predict(prompt='Hi', max_tokens=64, temperature=0.7,\n"
        "                       api_name='/generate')\n"
        "```\n"
        "**Raw HTTP** (Gradio 5 uses the `/gradio_api/call/` prefix):\n"
        "```bash\n"
        "curl -X POST https://aiencoder-turboquant-visualizer.hf.space/gradio_api/call/generate \\\n"
        "  -H 'content-type: application/json' \\\n"
        "  -d '{\"data\":[\"Hi\", 64, 0.7]}'\n"
        "# returns an EVENT_ID; then GET /gradio_api/call/generate/<EVENT_ID> for the result\n"
        "```\n"
        "Endpoints: `/generate`, `/visualize`, `/status`. "
        "Full machine-readable spec at "
        "[`/gradio_api/info`](https://aiencoder-turboquant-visualizer.hf.space/gradio_api/info)."
    )


if __name__ == "__main__":
    demo.queue(max_size=8).launch()
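
# Quick smoke test once the app is running (hypothetical local URL; use the
# Space URL instead when deployed):
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   text, stats = client.predict("Hi", 32, 0.7, api_name="/generate")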