add hf_token. add description
app.py
CHANGED
@@ -2,10 +2,16 @@ import gradio as gr
 from transformers import AutoConfig
 
 
-
-
-
-
+def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+    try:
+        cfg = AutoConfig.from_pretrained(
+            name,
+            trust_remote_code=True,
+            token=hf_token,
+        )
+    except Exception as e:
+        raise gr.Error(e)
+
     use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
 
     if hasattr(cfg, "text_config"):
@@ -43,13 +49,28 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str):
     return kv_cache_size, model_config
 
 
+DESCRIPTION = (
+    "NOTE:\n"
+    " - For gated repos, you will need to provide your HF token in the box below. You can "
+    "generate a new one at https://huggingface.co/settings/tokens. The token won't be stored "
+    "(you can check `app.py`).\n"
+    " - We don't take into account KV cache savings from sliding window attention (most "
+    "serving frameworks don't optimize for this anyway?)\n"
+    " - For Multi-head Latent Attention (MLA) used in DeepSeek-V2/V3, we calculate the "
+    "compressed KV cache as intended by MLA. This might not be supported on certain framework"
+    "+hardware combinations e.g. llama.cpp, MLX, which will fallback to Multi-head Attention "
+    "(MHA)."
+)
+
 demo = gr.Interface(
+    description=DESCRIPTION,
     fn=calculate,
     inputs=[
         gr.Textbox(label="model_id", value="google/gemma-3-1b-it"),
         gr.Number(label="Context length", value=128_000),
         gr.Number(label="No. of users", value=1),
         gr.Dropdown(label="KV cache dtype", choices=["fp16/bf16", "fp8"]),
+        gr.Textbox(label="HF token"),
     ],
     outputs=[
         gr.Number(label="KV cache size (GB)", precision=2),