Fix compression-ratio display + switch to repetition-safe default prompt; align with Docker SDK manifest (Dockerfile + requirements.txt + README.md)
- Dockerfile +17 -1
- README.md +31 -26
- app.py +164 -113
- requirements.txt +6 -1
Dockerfile
CHANGED

@@ -1,10 +1,21 @@
 # syntax=docker/dockerfile:1.6
+#
+# HuggingFace Space: kakeyalattice-demo (Docker SDK, Gradio inside)
+#
+# Build notes:
+# - CPU-first so this runs on a free HF Space (no GPU required).
+# - Pulls kakeyalattice + transformers + gradio from PyPI at build
+#   time, so the Space is self-contained and reproducible.
+# - App runs Gradio on port 7860 (HF Space default).
+
 FROM python:3.11-slim

+# System deps (minimal — Gradio + torch-cpu don't need much)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     && rm -rf /var/lib/apt/lists/*

+# Create an unprivileged user (HF Spaces expect UID 1000)
 RUN useradd -m -u 1000 user
 USER user
 ENV HOME=/home/user \
@@ -15,15 +26,20 @@ ENV HOME=/home/user \

 WORKDIR $HOME/app

+# Copy requirements first to maximise Docker layer cache
 COPY --chown=user:user requirements.txt .

+# Install CPU-only torch + gradio + our package from PyPI
+# NOTE: --extra-index-url pulls CPU-only torch (smaller image, faster cold start on free tier).
 RUN pip install --no-cache-dir --user --upgrade pip && \
     pip install --no-cache-dir --user \
         --extra-index-url https://download.pytorch.org/whl/cpu \
         -r requirements.txt

+# Copy the app
 COPY --chown=user:user app.py README.md ./

+# HF Space default port
 EXPOSE 7860

 CMD ["python", "app.py"]
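The build relies on `--extra-index-url https://download.pytorch.org/whl/cpu` to pull the CPU-only torch wheel. A quick way to confirm the built image actually got the CPU build is a minimal check like the sketch below (run with `python` inside the container; it is not part of the Space itself):

```python
# Sanity check for the CPU-only torch wheel the Dockerfile pulls.
# Run inside the built container; not shipped with the Space.
import torch

print(torch.__version__)           # CPU wheels report a "+cpu" local version, e.g. "2.4.1+cpu"
print(torch.cuda.is_available())   # expected: False on the free CPU tier
```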
README.md
CHANGED

@@ -8,27 +8,27 @@ app_port: 7860
 pinned: false
 license: apache-2.0
 ---

 # KakeyaLattice KV-cache compression demo

 Side-by-side comparison of **bf16 DynamicCache** vs **KakeyaLattice E8**
 compression at three quality levels (Q=10 aggressive, Q=38 balanced,
 Q=152 near-lossless) on a small HuggingFace causal LM.

 Default model: `Qwen/Qwen2-0.5B` (head_dim=64, E8-compatible, runs on
 free CPU tier). Override `KAKEYA_DEMO_MODEL` env var to use a larger
 model on a GPU Space.

 ## How it works

 `KakeyaLatticeCache` is a drop-in subclass of `transformers.DynamicCache`
 that applies a Zamir-Feder nested-lattice codec roundtrip (encode +
 decode) to every K and V written into the cache.

 ```python
 from transformers import AutoModelForCausalLM
 from kakeyalattice.hf import KakeyaLatticeCache

 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
 cache = KakeyaLatticeCache(
     variant="e8", q_range=38,
@@ -37,28 +37,33 @@ cache = KakeyaLatticeCache(
 )
 out = model.generate(input_ids, max_new_tokens=200, past_key_values=cache)
 ```

 ## What you'll see in the demo

 For each prompt, the app generates four times:

+| config                   | bits/token        | expected quality                   |
+| ------------------------ | ----------------- | ---------------------------------- |
+| bf16 DynamicCache        | 1024 (reference)  | identical to reference             |
+| E8 Q=152 near-lossless   | ~960 (-6%)        | essentially identical              |
+| E8 Q=38 balanced         | ~440 (-57%)       | ~1% deviation in ppl               |
+| E8 Q=10 aggressive       | ~320 (-69%)       | noticeably different but coherent  |
+
+Wall-clock latency per config is also reported.

 ## Caveats

+- The cache roundtrips K/V but stores the reconstructed tensor in the
+  model's KV dtype. Real HBM bytes saved are **nominal** — the demo's
+  value is showing reconstruction quality, not memory savings.
+- Decode is ~1.3-2× slower than bf16 because the codec runs as pure
+  PyTorch ops. A fused Triton kernel would close this gap.
+- Head-dim must be a power of 2 and divisible by 4 (D4) or 8 (E8).
+  Most modern LLMs satisfy this.

 ## Links

 - Package: https://pypi.org/project/kakeyalattice/
 - Repo: https://github.com/FluffyAIcode/LLM-KV--Cache-compress
+- Paper: `reports/paper/`
+- DeepSeek-V4-Flash Stage 0.75 findings: `reports/v1_5_release/dsv4_stage075/FINDINGS.md`
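The bits/token column above is what drives the compression figures the app prints per config. A worked sketch of that arithmetic for head_dim=64, using the approximate bits-per-vector values quoted in the table (illustrative numbers, not computed from the codec here):

```python
# Worked example of the compression-ratio arithmetic for head_dim = 64.
# bf16 stores head_dim * 16 = 1024 bits per K or V vector; the E8 values
# below are the approximate figures from the table above.
head_dim = 64
bf16_bits = head_dim * 16  # 1024

for label, bits in [("E8 Q=152", 960), ("E8 Q=38", 440), ("E8 Q=10", 320)]:
    ratio = bf16_bits / bits               # e.g. 1024 / 440 ≈ 2.33x
    saving = (bits / bf16_bits - 1) * 100  # negative = fewer bits than bf16
    print(f"{label}: {ratio:.2f}x ({saving:+.0f}% bits vs bf16)")
```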
app.py
CHANGED

@@ -1,20 +1,20 @@
 """Gradio Space demo: KakeyaLatticeCache on a small HF causal LM.

 Run locally:
+    pip install kakeyalattice[hf] gradio
+    python app.py

+Deploy to HF Spaces: see ./SPACE_README.md and ./HF_SPACE_DEPLOY.md.
+By default uses Qwen2-0.5B (head_dim=64, E8-compatible) so it fits on a
+free HF Space CPU. Swap to Qwen/Qwen2.5-1.5B or Llama-3.2-1B (GPU Space)
+for more interesting decode-length comparisons.

 The demo shows, side-by-side, the same prompt generated under:
+  (a) bf16 DynamicCache — reference
+  (b) KakeyaLatticeCache E8 Q=10 (aggressive, highest KV compression)
+  (c) KakeyaLatticeCache E8 Q=38 (balanced)
+  (d) KakeyaLatticeCache E8 Q=152 (near-lossless)
+and reports wall-clock + bits/vec vs bf16 baseline.
 """
 from __future__ import annotations

@@ -26,123 +26,174 @@ import gradio as gr
 import torch

 try:
+    from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
 except ImportError as e:
+    raise ImportError("Install transformers: pip install 'kakeyalattice[hf]'") from e

 from kakeyalattice.hf import KakeyaLatticeCache

+
 DEFAULT_MODEL = os.environ.get("KAKEYA_DEMO_MODEL", "Qwen/Qwen2-0.5B")
+DEFAULT_PROMPT = "List five countries in Africa:"
 _model_cache: dict = {}

+
 def _load_model(model_id: str, device: str):
+    key = (model_id, device)
+    if key in _model_cache:
+        return _model_cache[key]
+    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+        trust_remote_code=True,
+    ).to(device)
+    model.eval()
+    _model_cache[key] = (tok, model)
+    return tok, model
+

 def _generate_one(
+    tok, model, prompt: str, max_new: int, cache, device: str,
 ) -> tuple[str, float]:
+    ids = tok(prompt, return_tensors="pt").to(device)
+    t0 = time.perf_counter()
+    with torch.inference_mode():
+        out = model.generate(
+            **ids,
+            max_new_tokens=max_new,
+            do_sample=False,
+            past_key_values=cache,
+            use_cache=True,
+        )
+    elapsed = time.perf_counter() - t0
+    text = tok.decode(out[0], skip_special_tokens=True)
+    return text, elapsed
+

 def run_demo(
+    prompt: str,
+    max_new: int,
+    model_id: str,
+    device_pref: str,
 ) -> tuple[str, str, str, str, str]:
+    device = "cuda" if (device_pref == "auto" and torch.cuda.is_available()) else (
+        "cuda" if device_pref == "cuda" else "cpu"
+    )
+    tok, model = _load_model(model_id, device)
+
+    cfg = model.config
+    num_hidden_layers = cfg.num_hidden_layers
+    head_dim = getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads)
+    bf16_bits = head_dim * 16  # reference: bits per token per head in bf16
+
+    results = []
+
+    baseline_cache = DynamicCache()
+    text_bf16, t_bf16 = _generate_one(tok, model, prompt, max_new, baseline_cache, device)
+    results.append(("bf16 DynamicCache (reference)", text_bf16, t_bf16, bf16_bits))
+
+    for q, label in [
+        (10, "E8 Q=10 aggressive"),
+        (38, "E8 Q=38 balanced"),
+        (152, "E8 Q=152 near-lossless"),
+    ]:
+        try:
+            cache = KakeyaLatticeCache(
+                variant="e8", q_range=q,
+                num_hidden_layers=num_hidden_layers,
+                head_dim=head_dim,
+                device=device,
+                strict=False,
+            )
+            text, t = _generate_one(tok, model, prompt, max_new, cache, device)
+            bits = cache._codecs[0].bits_per_token_per_head if cache._codecs else bf16_bits
+            results.append((f"KakeyaLattice {label}", text, t, bits))
+        except Exception as e:
+            results.append((f"KakeyaLattice {label} (FAILED)", f"Error: {e}", 0.0, 0))
+
+    header = (
+        f"**Model:** `{model_id}` | **head_dim:** {head_dim} | "
+        f"**device:** {device} | **new_tokens:** {max_new} | "
+        f"**bf16 reference bits/vec:** {bf16_bits}"
+    )
+
+    rows = []
+    for (name, text, t, bits) in results:
+        if bits > 0:
+            cr = bf16_bits / bits
+            bit_saving = (bits / bf16_bits - 1) * 100  # negative = fewer bits than bf16
+            cr_str = f"{cr:.2f}x"
+            cr_detail = f"{bit_saving:+.0f}% bits vs bf16"
+        else:
+            cr_str = "n/a"
+            cr_detail = "failed"
+        rows.append(
+            f"\n### {name}\n\n"
+            f"- **latency:** {t:.2f}s\n"
+            f"- **bits/vec:** {bits} (bf16 ref: {bf16_bits})\n"
+            f"- **Compression:** {cr_str} ({cr_detail})\n\n"
+            f"{text}"
+        )
+    return header, *rows
+
+
+EXAMPLE_PROMPTS = [
+    ["List five countries in Africa:"],
+    ["Translate 'good morning' into French, Spanish, German, and Japanese:"],
+    ["Write a two-sentence summary of what a transformer is in machine learning:"],
+    ["What is 17 times 23? Show your work step by step."],
+]
+

 with gr.Blocks(title="KakeyaLattice KV-cache compression demo") as demo:
+    gr.Markdown(
+        "# KakeyaLattice KV-cache compression demo\n\n"
+        "Compare generation output + latency across **bf16 baseline** and "
+        "three **KakeyaLattice E8** compression levels on a small HF causal LM. "
+        "The E8 variant uses 8-D nested-lattice closest-point quantisation "
+        "with Sylvester-Hadamard rotation and per-vector adaptive scaling."
+    )
+    with gr.Row():
+        prompt = gr.Textbox(
+            label="Prompt",
+            value=DEFAULT_PROMPT,
+            lines=3,
+        )
+    with gr.Row():
+        max_new = gr.Slider(minimum=16, maximum=512, value=128, step=16, label="Max new tokens")
+        model_id = gr.Textbox(label="HF model id", value=DEFAULT_MODEL)
+        device_pref = gr.Radio(choices=["auto", "cpu", "cuda"], value="auto", label="Device")
+    run_btn = gr.Button("Run comparison", variant="primary")
+
+    gr.Examples(
+        examples=EXAMPLE_PROMPTS,
+        inputs=[prompt],
+        label="Example prompts (click to fill)",
+    )
+
+    gr.Markdown(
+        "### About the default model\n\n"
+        f"The default model is **{DEFAULT_MODEL}** (0.5B params). It runs on a "
+        "free HF Space CPU but is *small*. Small models can fall into "
+        "greedy-decode repetition loops on open-ended prompts — that is a "
+        "property of the **model**, not the codec. If you see all four outputs "
+        "repeating the same phrase, try a short, fact-shaped prompt (e.g. "
+        "\"List five countries in Africa:\") or switch to a larger model "
+        "(`KAKEYA_DEMO_MODEL=Qwen/Qwen2.5-1.5B`) on a GPU Space."
+    )
+
+    header_out = gr.Markdown("")
+    out_bf16 = gr.Markdown("")
+    out_q10 = gr.Markdown("")
+    out_q38 = gr.Markdown("")
+    out_q152 = gr.Markdown("")
+    run_btn.click(
+        fn=run_demo,
+        inputs=[prompt, max_new, model_id, device_pref],
+        outputs=[header_out, out_bf16, out_q10, out_q38, out_q152],
+    )
+

 if __name__ == "__main__":
-    demo.launch(
-        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
-        server_port=int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", "7860"))),
-    )
+    demo.launch(server_name="0.0.0.0", server_port=7860)
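Because `run_demo` is a plain function, the four-way comparison can be smoke-tested without launching the Gradio UI. A minimal sketch, assuming `app.py` sits in the working directory and the packages from requirements.txt are installed (a CPU run of four generations takes a few minutes):

```python
# Minimal local smoke test for run_demo, bypassing the Gradio UI.
# Assumes app.py is importable from the current directory.
from app import run_demo

header, bf16, q10, q38, q152 = run_demo(
    prompt="List five countries in Africa:",  # the repetition-safe default prompt
    max_new=32,                               # keep generations short on CPU
    model_id="Qwen/Qwen2-0.5B",
    device_pref="cpu",
)
print(header)
print(q38)  # latency, bits/vec, compression ratio, and text for the balanced config
```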
requirements.txt
CHANGED

@@ -1,6 +1,11 @@
+# Pinned for reproducible Docker builds on HF Space.
+# Loose-pinned (>=) so security patches land automatically.
 kakeyalattice[hf]>=1.5.0
 gradio>=4.44
 transformers>=4.45
+# CPU torch (via --extra-index-url https://download.pytorch.org/whl/cpu in Dockerfile)
 torch>=2.1
+# Deps that transformers pulls but we want explicit for the free-CPU Space
+# (Qwen2 / LLaMA tokenizers):
 sentencepiece>=0.2.0
-tiktoken>=0.7.0
+tiktoken>=0.7.0
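A quick way to see what actually got installed against these floor pins, using only the standard library (run inside the built image or a local venv):

```python
# Print installed versions for the floor-pinned requirements above.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("kakeyalattice", "gradio", "transformers",
            "torch", "sentencepiece", "tiktoken"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```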