Switch default model to Qwen3-0.6B (head_dim=128, GQA); bump transformers>=4.51
- README.md +19 -11
- app.py +17 -11
- requirements.txt +1 -1
README.md
CHANGED
@@ -15,9 +15,12 @@ Side-by-side comparison of **bf16 DynamicCache** vs **KakeyaLattice E8**
 compression at three quality levels (Q=10 aggressive, Q=38 balanced,
 Q=152 near-lossless) on a small HuggingFace causal LM.
 
-Default model: `Qwen/…
-…
-…
+Default model: `Qwen/Qwen3-0.6B` (head_dim=128, GQA 16/8 – the same
+attention shape as modern production LLMs, so the codec numbers are
+representative). Runs on the free CPU tier (each "Run comparison"
+click takes ~4–8 minutes on 2 cores). Set the `KAKEYA_DEMO_MODEL`
+env var to use a larger model on a GPU Space (`Qwen/Qwen3-1.7B`,
+`Qwen/Qwen3-4B`).
 
 ## How it works
 
@@ -29,7 +32,7 @@ decode) to every K and V written into the cache.
 from transformers import AutoModelForCausalLM
 from kakeyalattice.hf import KakeyaLatticeCache
 
-model = AutoModelForCausalLM.from_pretrained("Qwen/…
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")
 cache = KakeyaLatticeCache(
     variant="e8", q_range=38,
     num_hidden_layers=model.config.num_hidden_layers,
@@ -40,14 +43,19 @@ out = model.generate(input_ids, max_new_tokens=200, past_key_values=cache)
 
 ## What you'll see in the demo
 
-For each prompt, the app generates four times…
+For each prompt, the app generates four times (bits/vec here assume
+head_dim=128 – the bf16 baseline is 2048 bits/vec; exact numbers for
+other head_dims scale proportionally):
 
-| config | bits/…
-| ------------------------ | -----------------…
-| bf16 DynamicCache | …
-| E8 Q=152 near-lossless | ~…
-| E8 Q=38 balanced | ~…
-| E8 Q=10 aggressive | ~…
+| config                   | bits/vec (head_dim=128) | expected quality                  |
+| ------------------------ | ----------------------- | --------------------------------- |
+| bf16 DynamicCache        | 2048 (reference)        | identical to reference            |
+| E8 Q=152 near-lossless   | ~1920 (-6%)             | essentially identical             |
+| E8 Q=38 balanced         | ~880 (-57%)             | ~1% deviation in ppl              |
+| E8 Q=10 aggressive       | ~640 (-69%)             | noticeably different but coherent |
+
+(The percentage savings `-6% / -57% / -69%` are what matter – they are
+fixed by the E8 codec design and do not depend on head_dim.)
 
 Wall-clock latency per config is also reported.
 
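The bits/vec column is plain arithmetic over head_dim: a bf16 vector costs head_dim × 16 bits, so 128 × 16 = 2048. A quick sanity check of the savings column (the ~1920/~880/~640 figures are the README's own numbers, not fresh measurements):

```python
# Back-of-envelope check of the bits/vec table added in this commit.
HEAD_DIM = 128
BF16_BITS = HEAD_DIM * 16  # 2048 bits per K or V vector in bf16

e8_bits = {"Q=152": 1920, "Q=38": 880, "Q=10": 640}  # figures from the table
for q, bits in e8_bits.items():
    saving = 1 - bits / BF16_BITS
    print(f"E8 {q}: {bits} bits/vec, {saving:.0%} smaller than bf16")
# E8 Q=152: 1920 bits/vec, 6% smaller than bf16
# E8 Q=38: 880 bits/vec, 57% smaller than bf16
# E8 Q=10: 640 bits/vec, 69% smaller than bf16
```

Because the saving is a ratio of bit counts, head_dim cancels out, which is why the -6% / -57% / -69% column carries over unchanged to models with other head dimensions.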
app.py
CHANGED
@@ -5,9 +5,10 @@ Run locally:
 python app.py
 
 Deploy to HF Spaces: see ./SPACE_README.md and ./HF_SPACE_DEPLOY.md.
-By default uses …
-free HF Space CPU …
-…
+By default uses Qwen3-0.6B (head_dim=128, GQA 16/8, E8-compatible) –
+it fits on a free HF Space CPU and is architecturally closer to
+production LLMs than Qwen2-0.5B. Swap to Qwen/Qwen3-1.7B or
+Qwen/Qwen3-4B (GPU Space) for faster / longer comparisons.
 
 The demo shows, side-by-side, the same prompt generated under:
 (a) bf16 DynamicCache – reference
@@ -33,7 +34,7 @@ except ImportError as e:
 from kakeyalattice.hf import KakeyaLatticeCache
 
 
-DEFAULT_MODEL = os.environ.get("KAKEYA_DEMO_MODEL", "Qwen/…
+DEFAULT_MODEL = os.environ.get("KAKEYA_DEMO_MODEL", "Qwen/Qwen3-0.6B")
 DEFAULT_PROMPT = "List five countries in Africa:"
 _model_cache: dict = {}
 
@@ -174,13 +175,18 @@ with gr.Blocks(title="KakeyaLattice KV-cache compression") as demo:
 
     gr.Markdown(
         "### About the default model\n\n"
-        f"The default model is **{DEFAULT_MODEL}** (0.…
-        "free HF Space CPU…
-        "…
-        "…
-        "…
-        "…
-        "…
+        f"The default model is **{DEFAULT_MODEL}** (0.6B params, head_dim=128, "
+        "GQA 16/8). It runs on a free HF Space CPU in roughly 4–8 minutes per "
+        "'Run comparison' click (four generations × ~128 tokens each on 2 "
+        "cores). That is slow but deliberate: Qwen3's head_dim=128 + GQA is "
+        "the same shape used by most production LLMs, so the E8 codec numbers "
+        "you see here are representative.\n\n"
+        "Small models can still fall into greedy-decode repetition loops on "
+        "open-ended prompts – that is a property of the **model**, not the "
+        "codec. If you see all four outputs repeating the same phrase, try a "
+        "short, fact-shaped prompt (e.g. \"List five countries in Africa:\"). "
+        "For faster decode / larger context, switch to a GPU Space and set "
+        "`KAKEYA_DEMO_MODEL=Qwen/Qwen3-1.7B` or `Qwen/Qwen3-4B`."
     )
 
     header_out = gr.Markdown("")
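The diff changes `DEFAULT_MODEL` and keeps the `_model_cache` dict, but the loader that consumes them sits outside the hunks. For orientation, a minimal sketch of an env-overridable, memoized loader in the same style (the `_get_model` name, the float32 choice, and the `eval()` call are assumptions for illustration, not code from this repo):

```python
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

DEFAULT_MODEL = os.environ.get("KAKEYA_DEMO_MODEL", "Qwen/Qwen3-0.6B")
_model_cache: dict = {}


def _get_model(name: str = DEFAULT_MODEL):
    """Hypothetical helper: load (model, tokenizer) once per model name."""
    if name not in _model_cache:
        tokenizer = AutoTokenizer.from_pretrained(name)
        # float32 is the safe default on the free CPU tier; fast bf16
        # kernels are not a given on 2 shared cores.
        model = AutoModelForCausalLM.from_pretrained(
            name, torch_dtype=torch.float32
        )
        model.eval()  # inference only; disable dropout
        _model_cache[name] = (model, tokenizer)
    return _model_cache[name]
```

With this shape, `KAKEYA_DEMO_MODEL=Qwen/Qwen3-1.7B python app.py` swaps the model without touching code, and repeated "Run comparison" clicks reuse the already-loaded weights.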
requirements.txt
CHANGED
@@ -2,7 +2,7 @@
 # Loose-pinned (>=) so security patches land automatically.
 kakeyalattice[hf]>=1.5.0
 gradio>=4.44
-transformers>=4.…
+transformers>=4.51  # Qwen3ForCausalLM requires 4.51+
 # CPU torch (via --extra-index-url https://download.pytorch.org/whl/cpu in Dockerfile)
 torch>=2.1
 # Deps that transformers pulls but we want explicit for the free-CPU Space
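The `>=4.51` pin only helps at install time; a Space image that already has an older transformers will fail at model load with an unrecognized `qwen3` model type. A fail-fast guard like the following (illustrative, not part of this commit) surfaces the requirement explicitly at startup:

```python
# Illustrative startup guard (not in this commit): surface the
# transformers>=4.51 requirement before Qwen3 loading fails obscurely.
from importlib.metadata import version

from packaging.version import Version  # packaging is already a transformers dep

if Version(version("transformers")) < Version("4.51"):
    raise RuntimeError(
        f"transformers {version('transformers')} is installed, but Qwen3 "
        "models need >=4.51. Run: pip install -U 'transformers>=4.51'"
    )
```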