AIencoder committed · verified
Commit: a61623a · 1 Parent(s): 48cba22

v8: api_name= on click handlers + background model preload + status indicator
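The `api_name=` handlers mean each button now doubles as a named API endpoint. A minimal sketch of driving them from the Gradio Python client — the Space id is taken from the app's footer markdown, and `pip install gradio_client` is assumed:

```python
from gradio_client import Client

client = Client("AIencoder/turboquant-visualizer")  # Space id from the app footer

# /generate wraps chat(prompt, max_tokens, temperature) and returns
# (output text, stats markdown), matching the two output components.
text, stats = client.predict(
    "Explain quantization in one paragraph.",  # prompt
    96,                                        # max new tokens
    0.7,                                       # temperature
    api_name="/generate",
)
print(stats)

# /status polls the background loader while the GGUF is still downloading.
print(client.predict(api_name="/status"))
```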

Files changed (1)
  1. app.py +100 -56
app.py CHANGED
@@ -3,13 +3,17 @@
 Two tabs:
 1. Run inference: live llama.cpp on TinyLlama-1.1B-Chat-Q4_K_M via the
    prebuilt llama-cpp-python wheel from AIencoder/llama-cpp-wheels.
-2. TurboQuant math viz: shows what the offline rotation does to the
-   weight distribution that quantization sees.
+2. TurboQuant math viz: shows what offline rotation does to the weight
+   distribution that quantization sees.
+
+Background-loaded model + named API endpoint so curl / requests / the
+Gradio Python client all work.
 """
 from __future__ import annotations
 
 import io
 import os
+import threading
 import time
 
 import gradio as gr
@@ -24,44 +28,70 @@ from hadamard import block_hadamard_inplace
 from bench import heavy_tailed_weight, measure
 
 
-_llm = None
-_load_error = None
-
 MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
 MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
 
+# ─── Background model loader ─────────────────────────────────────────────────
+# Loading the GGUF is slow (~60s download + ~5s mmap on a free Space) and
+# the first chat() call would otherwise time out the Gradio request. We
+# kick the load on a daemon thread at import time and pin the result in
+# module globals; chat() blocks briefly on that.
+_llm = None
+_llm_error: str | None = None
+_llm_status = "loading: starting up"
 
-def _ensure_llm():
-    global _llm, _load_error
-    if _llm is not None:
-        return _llm, None
-    if _load_error is not None:
-        return None, _load_error
+
+def _load_model():
+    global _llm, _llm_error, _llm_status
     try:
+        _llm_status = f"loading: downloading {MODEL_FILE}"
         from huggingface_hub import hf_hub_download
-        from llama_cpp import Llama
         path = hf_hub_download(
             repo_id=MODEL_REPO,
             filename=MODEL_FILE,
             cache_dir=os.environ.get("HF_HOME", "/tmp/hf"),
         )
-        _llm = Llama(
+        _llm_status = "loading: instantiating llama-cpp"
+        from llama_cpp import Llama
+        llm = Llama(
             model_path=path,
             n_ctx=2048,
             n_threads=int(os.environ.get("LLAMA_THREADS", "2")),
             n_batch=64,
             verbose=False,
         )
-        return _llm, None
+        _llm = llm
+        _llm_status = "ready"
     except Exception as e:
-        _load_error = f"failed to load model: {e}"
-        return None, _load_error
+        _llm_error = f"{type(e).__name__}: {e}"
+        _llm_status = f"failed: {_llm_error}"
 
 
+threading.Thread(target=_load_model, daemon=True).start()
+
+
+def _await_llm(timeout: float = 240.0):
+    """Block until the model is loaded (or loading fails). Raises
+    RuntimeError with a meaningful message if loading failed."""
+    t0 = time.time()
+    while _llm is None and _llm_error is None:
+        if time.time() - t0 > timeout:
+            raise RuntimeError(f"timeout after {timeout:.0f}s; status: {_llm_status}")
+        time.sleep(0.5)
+    if _llm_error:
+        raise RuntimeError(_llm_error)
+    return _llm
+
+
+# ─── Inference ───────────────────────────────────────────────────────────────
 def chat(prompt: str, max_tokens: int, temperature: float):
-    llm, err = _ensure_llm()
-    if err:
-        return f"Loading error: {err}", ""
+    """Return (output, stats markdown). The wrapped error path keeps the UI
+    responsive even when the model is mid-download."""
+    try:
+        llm = _await_llm()
+    except RuntimeError as e:
+        return f"⏳ model not ready: {e}", f"status: **{_llm_status}**"
+
     formatted = (
         f"<|system|>\nYou are a concise assistant.</s>\n"
         f"<|user|>\n{prompt}</s>\n"
@@ -77,39 +107,37 @@ def chat(prompt: str, max_tokens: int, temperature: float):
         echo=False,
     )
     dt = time.time() - t0
-    text = out["choices"][0]["text"].strip()
+    text = out["choices"][0]["text"].strip() or "(empty)"
     n = out["usage"]["completion_tokens"]
-    tps = n / max(dt, 1e-3)
     stats = (
-        f"**{n} tokens** in **{dt:.2f}s** -> **{tps:.1f} tok/s**\n\n"
-        f"This is baseline Q4_K_M. With TurboQuant rotation you can drop "
-        f"to Q3_K_M at similar quality and pick up ~25% more tok/s on the "
-        f"same hardware (math in the next tab)."
+        f"**{n} tokens** in **{dt:.2f}s** → **{n/max(dt,1e-3):.1f} tok/s**\n\n"
+        f"Q4_K_M baseline. With TurboQuant rotation you can drop to Q3_K_M "
+        f"at similar quality and pick up ~25% more tok/s on the same hardware "
+        f"(math in the next tab)."
     )
-    return text or "(empty)", stats
+    return text, stats
+
+
+def status_check():
+    return f"model status: **{_llm_status}**"
 
 
+# ─── Visualization ───────────────────────────────────────────────────────────
 def _plot(W_raw, W_rot, block):
     fig, axes = plt.subplots(1, 3, figsize=(13, 3.6))
-    raw = W_raw.flatten().numpy()
-    rot = W_rot.flatten().numpy()
-
     bins = np.linspace(-0.5, 0.5, 121)
-    axes[0].hist(raw, bins=bins, color="#888", alpha=0.85)
-    axes[0].set_title("raw weights - heavy-tailed")
+    axes[0].hist(W_raw.flatten().numpy(), bins=bins, color="#888", alpha=0.85)
+    axes[0].set_title("raw weights — heavy-tailed")
     axes[0].set_xlim(-0.5, 0.5); axes[0].set_yscale("log")
-
-    axes[1].hist(rot, bins=bins, color="#3B82F6", alpha=0.85)
-    axes[1].set_title("after block-Hadamard - Gaussianized")
+    axes[1].hist(W_rot.flatten().numpy(), bins=bins, color="#3B82F6", alpha=0.85)
+    axes[1].set_title("after block-Hadamard — Gaussianized")
     axes[1].set_xlim(-0.5, 0.5); axes[1].set_yscale("log")
-
     raw_blkmax = W_raw.reshape(-1, block).abs().amax(dim=-1).numpy()
     rot_blkmax = W_rot.reshape(-1, block).abs().amax(dim=-1).numpy()
    axes[2].hist(raw_blkmax, bins=40, alpha=0.6, label="raw", color="#888")
    axes[2].hist(rot_blkmax, bins=40, alpha=0.6, label="rotated", color="#3B82F6")
    axes[2].set_title(f"per-{block} block max|w|")
    axes[2].legend()
-
     fig.tight_layout()
     buf = io.BytesIO()
     fig.savefig(buf, format="png", dpi=110)
@@ -122,7 +150,6 @@ def visualize(rows, cols, block, seed):
     W = heavy_tailed_weight(int(rows), int(cols), int(seed))
     W_rot = W.clone().double()
     block_hadamard_inplace(W_rot, axis=-1, block=int(block))
-
     lines = []
     for bits in (4, 3, 2):
         s_base = measure(W, bits=bits, rotated=False, block=int(block))
@@ -130,11 +157,10 @@ def visualize(rows, cols, block, seed):
         lines.append(
             f"Q{bits} raw MSE = {s_base.mse:.3e} "
             f"TQ MSE = {s_rot.mse:.3e} "
-            f"x {s_base.mse/max(s_rot.mse,1e-30):.1f} better"
+            f"× {s_base.mse/max(s_rot.mse,1e-30):.1f} better"
         )
-
     summary = (
-        f"weight = {rows} x {cols}, block = {block}\n"
+        f"weight = {rows} × {cols}, block = {block}\n"
         f"per-block max|w| raw mean = "
         f"{W.reshape(-1, int(block)).abs().amax(dim=-1).mean():.3f}\n"
         f"per-block max|w| rot mean = "
@@ -144,20 +170,25 @@ def visualize(rows, cols, block, seed):
144
  return _plot(W, W_rot, int(block)), summary
145
 
146
 
147
- with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
148
- theme=gr.themes.Soft()) as demo:
149
- gr.Markdown("# turbocpp - llama.cpp + TurboQuant")
150
  gr.Markdown(
151
  "Live llama.cpp running TinyLlama-1.1B-Chat (Q4_K_M) via a "
152
- "prebuilt wheel + interactive Hadamard-rotation visualizer. "
153
- "Code: [github.com/Ary5272/turbocpp](https://github.com/Ary5272/turbocpp)"
 
 
 
154
  )
155
 
156
  with gr.Tab("Run inference"):
157
  gr.Markdown(
158
  "Live llama.cpp inference on TinyLlama-1.1B-Chat at Q4_K_M, "
159
- "loaded via `llama-cpp-python` on this Space's CPU."
 
160
  )
 
161
  prompt_in = gr.Textbox(
162
  value="Explain quantization in one paragraph.",
163
  label="prompt", lines=3,
@@ -165,17 +196,25 @@ with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
         with gr.Row():
             max_t = gr.Slider(8, 256, value=96, step=8, label="max new tokens")
             temp = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="temperature")
-        run_btn = gr.Button("generate", variant="primary")
+        with gr.Row():
+            run_btn = gr.Button("generate", variant="primary")
+            status_btn = gr.Button("refresh model status", variant="secondary")
         out_box = gr.Textbox(label="output", lines=10)
         stats_box = gr.Markdown()
-        run_btn.click(chat, [prompt_in, max_t, temp], [out_box, stats_box])
+        run_btn.click(
+            chat,
+            inputs=[prompt_in, max_t, temp],
+            outputs=[out_box, stats_box],
+            api_name="generate",  # ← exposed as named API endpoint
+        )
+        status_btn.click(status_check, outputs=status_md, api_name="status")
 
     with gr.Tab("TurboQuant math viz"):
         gr.Markdown(
             "Drag the sliders to see how a Walsh-Hadamard rotation "
             "reshapes a synthetic LLM-style weight distribution. The "
-            "rotation is orthogonal - fp32 model output is unchanged - "
-            "but per-block max-abs drops 3-5x -> much smaller Q4 / Q4_K "
+            "rotation is orthogonal — fp32 model output is unchanged — "
+            "but per-block max-abs drops 3-5× → much smaller Q4 / Q4_K "
             "rounding error."
         )
         with gr.Row():
@@ -183,19 +222,24 @@ with gr.Blocks(title="turbocpp - llama.cpp + TurboQuant",
             cols = gr.Slider(64, 4096, value=4096, step=64, label="cols")
             block = gr.Slider(32, 256, value=128, step=32, label="block size")
             seed = gr.Slider(0, 1000, value=0, step=1, label="seed")
-        viz_btn = gr.Button("visualize")
+        viz_btn = gr.Button("visualize", variant="primary")
         img_out = gr.Image(type="pil", label="distributions")
         rep_out = gr.Textbox(label="quant-error report", lines=8)
-        viz_btn.click(visualize, [rows, cols, block, seed], [img_out, rep_out])
+        viz_btn.click(
+            visualize,
+            inputs=[rows, cols, block, seed],
+            outputs=[img_out, rep_out],
+            api_name="visualize",
+        )
         demo.load(visualize, [rows, cols, block, seed], [img_out, rep_out])
 
     gr.Markdown(
         "---\n"
-        "Want the actual A/B speed numbers? Clone the repo and run "
-        "`scripts/bench_e2e.sh /path/to/HF/Llama-3-8B`, or pull the Docker "
-        "image: `docker pull ghcr.io/ary5272/turbocpp:turboquant`."
+        "**API**: every button is a named endpoint — POST to "
+        "`/api/generate`, `/api/visualize`, `/api/status`. Or use "
+        "`gradio_client.Client('AIencoder/turboquant-visualizer').predict(...)`."
    )
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=8).launch()
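The math-viz tab's claim — an orthogonal rotation leaves the fp32 output unchanged but shrinks per-block max|w| — is easy to sanity-check without the repo's `hadamard` / `bench` helpers. A self-contained sketch using `scipy.linalg.hadamard` as a stand-in for `block_hadamard_inplace` (the Student-t weight distribution is an assumption about what `heavy_tailed_weight` produces):

```python
import numpy as np
from scipy.linalg import hadamard

block = 128
rng = np.random.default_rng(0)
# Heavy-tailed stand-in for an LLM-style weight matrix (assumed shape of
# the distribution; the repo's heavy_tailed_weight may differ).
W = rng.standard_t(df=3, size=(1024, 4096)) * 0.02

H = hadamard(block) / np.sqrt(block)          # orthogonal: H @ H.T == I
W_rot = (W.reshape(-1, block) @ H).reshape(W.shape)

def mean_block_max(M):
    return np.abs(M.reshape(-1, block)).max(axis=1).mean()

print(f"raw per-block max|w| mean: {mean_block_max(W):.4f}")
print(f"rot per-block max|w| mean: {mean_block_max(W_rot):.4f}")  # typically 3-5× smaller
# Because H is orthogonal, applying the matching inverse rotation at load
# time restores the fp32 matmul result up to float rounding.
```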
 
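For completeness, the raw-HTTP route the new footer advertises. This is a hedged sketch: the `{"data": [...]}` payload shape is the Gradio 3.x convention for named endpoints (Gradio 4.x uses a two-step `/call/generate` flow instead), and the `*.hf.space` hostname is inferred from the Space id — the Space's own "Use via API" page is authoritative.

```python
import requests

# Hostname assumed from the Space id in the footer (owner-space.hf.space).
BASE = "https://aiencoder-turboquant-visualizer.hf.space"

resp = requests.post(
    f"{BASE}/api/generate",
    json={"data": ["Explain quantization in one paragraph.", 96, 0.7]},
    timeout=300,  # first call may block while the GGUF downloads
)
resp.raise_for_status()
text, stats = resp.json()["data"]
print(text)
```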