import gradio as gr
from transformers import AutoConfig


def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
    hf_token = hf_token.strip()
    # Fetch the model config from the Hub; a token is only needed for gated repos.
    try:
        cfg = AutoConfig.from_pretrained(
            name,
            trust_remote_code=True,
            token=hf_token or None,
        )
    except Exception as e:
        raise gr.Error(str(e))
    # DeepSeek-V2/V3 use Multi-head Latent Attention (MLA), which caches a compressed KV.
    use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))

    # Multimodal configs nest the language-model settings under `text_config`.
    if hasattr(cfg, "text_config"):
        cfg = cfg.text_config

    num_layers = cfg.num_hidden_layers
    model_config = [
        ["num_layers", num_layers],
        ["max_ctx_len", cfg.max_position_embeddings],
    ]
    if ctx_len > cfg.max_position_embeddings:
        gr.Warning(
            "Requested context length is larger than the max value supported by the model"
        )
    # TODO: show attention type, show calculation
    if use_mla:
        # MLA caches the compressed KV latent (kv_lora_rank) plus the decoupled RoPE key
        # (qk_rope_head_dim) per token per layer.
        kv_lora_rank = cfg.kv_lora_rank
        qk_rope_head_dim = cfg.qk_rope_head_dim
        nelems_per_token = num_layers * (kv_lora_rank + qk_rope_head_dim)
        model_config.append(["kv_lora_rank", kv_lora_rank])
        model_config.append(["qk_rope_head_dim", qk_rope_head_dim])
    else:
        # MHA/GQA caches both K and V (hence the factor of 2) for each KV head per layer.
        num_kv_heads = cfg.num_key_value_heads
        head_dim = getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads)
        nelems_per_token = num_layers * num_kv_heads * head_dim * 2
        model_config.append(["num_kv_heads", num_kv_heads])
        model_config.append(["head_dim", head_dim])
    if dtype == "fp16/bf16":
        nbytes_per_elem = 2
    elif dtype == "fp8":
        nbytes_per_elem = 1 + 2 / cfg.hidden_size  # assume per-token scaling
    # Total KV cache size in GB.
    kv_cache_size = nelems_per_token * ctx_len * num_users * nbytes_per_elem / 1e9
    return kv_cache_size, model_config
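

# Worked example of the MHA/GQA formula above (illustrative only, assuming
# Llama-3-8B-style values: 32 layers, 8 KV heads, head_dim=128, bf16 cache,
# 128k context, 1 user):
#   nelems_per_token = 32 * 8 * 128 * 2 = 65,536
#   kv_cache_size = 65,536 * 128_000 * 1 * 2 / 1e9 ≈ 16.8 GB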

DESCRIPTION = (
    "NOTE:\n"
    " - For gated repos, you will need to provide your HF token in the box below. You can "
    "generate a new one at https://huggingface.co/settings/tokens. The token won't be stored "
    "(you can check `app.py`).\n"
    " - We don't take into account KV cache savings from sliding window attention (most "
    "serving frameworks don't optimize for this anyway).\n"
    " - For Multi-head Latent Attention (MLA) used in DeepSeek-V2/V3, we calculate the "
    "compressed KV cache as intended by MLA. This might not be supported on certain "
    "framework+hardware combinations, e.g. llama.cpp and MLX, which will fall back to "
    "Multi-head Attention (MHA)."
)

demo = gr.Interface(
    description=DESCRIPTION,
    fn=calculate,
    inputs=[
        gr.Textbox(label="model_id", value="Qwen/QwQ-32B"),
        gr.Number(label="Context length", value=128_000),
        gr.Number(label="No. of users", value=1),
        gr.Dropdown(label="KV cache dtype", choices=["fp16/bf16", "fp8"]),
        gr.Textbox(label="HF token"),
    ],
    outputs=[
        gr.Number(label="KV cache size (GB)", precision=2),
        gr.Dataframe(
            label="Model config", headers=["Key", "Value"], datatype=["str", "int"]
        ),
    ],
)
demo.launch()
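
# The calculator can also be used without the UI, e.g. from a Python shell
# (a sketch, assuming the model config can be downloaded from the Hub):
#   size_gb, cfg_rows = calculate(
#       "Qwen/QwQ-32B", ctx_len=32_768, num_users=1, dtype="fp16/bf16", hf_token=""
#   )
#   print(f"KV cache: {size_gb:.2f} GB")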