gaunernst committed
Commit 77bb0fc · 1 Parent(s): f9ee628

add hf_token. add description

Files changed (1):
  1. app.py +25 -4
app.py CHANGED
@@ -2,10 +2,16 @@ import gradio as gr
 from transformers import AutoConfig
 
 
-# TODO: access token for gated repo
-# TODO: add note about sliding window attention
-def calculate(name: str, ctx_len: int, num_users: int, dtype: str):
-    cfg = AutoConfig.from_pretrained(name, trust_remote_code=True)
+def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+    try:
+        cfg = AutoConfig.from_pretrained(
+            name,
+            trust_remote_code=True,
+            token=hf_token,
+        )
+    except Exception as e:
+        raise gr.Error(e)
+
     use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
 
     if hasattr(cfg, "text_config"):
@@ -43,13 +49,28 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str):
     return kv_cache_size, model_config
 
 
+DESCRIPTION = (
+    "NOTE:\n"
+    " - For gated repos, you will need to provide your HF token in the box below. You can "
+    "generate a new one at https://huggingface.co/settings/tokens. The token won't be stored "
+    "(you can check `app.py`).\n"
+    " - We don't take into account KV cache savings from sliding window attention (most "
+    "serving frameworks don't optimize for this anyway?)\n"
+    " - For Multi-head Latent Attention (MLA) used in DeepSeek-V2/V3, we calculate the "
+    "compressed KV cache as intended by MLA. This might not be supported on certain framework"
+    "+hardware combinations e.g. llama.cpp, MLX, which will fall back to Multi-head Attention "
+    "(MHA)."
+)
+
 demo = gr.Interface(
+    description=DESCRIPTION,
     fn=calculate,
     inputs=[
         gr.Textbox(label="model_id", value="google/gemma-3-1b-it"),
         gr.Number(label="Context length", value=128_000),
         gr.Number(label="No. of users", value=1),
         gr.Dropdown(label="KV cache dtype", choices=["fp16/bf16", "fp8"]),
+        gr.Textbox(label="HF token"),
     ],
     outputs=[
         gr.Number(label="KV cache size (GB)", precision=2),
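
To make the MLA note in DESCRIPTION concrete, here is a minimal sketch of how a KV-cache estimate along these lines could be computed from an AutoConfig. This is not the code in app.py (the diff only shows part of it); it assumes standard MHA/GQA config fields (num_hidden_layers, num_key_value_heads, head_dim / hidden_size) and DeepSeek-style MLA fields (kv_lora_rank, qk_rope_head_dim), and the helper name kv_cache_gb is made up for illustration.

from transformers import AutoConfig

def kv_cache_gb(cfg, ctx_len: int, num_users: int, dtype: str) -> float:
    # Illustrative estimate only; the app's actual formula lives in app.py.
    bytes_per_elem = 1 if dtype == "fp8" else 2  # fp16/bf16 use 2 bytes per element
    use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
    if hasattr(cfg, "text_config"):  # multimodal configs nest the LM config
        cfg = cfg.text_config
    if use_mla:
        # MLA: cache the compressed KV latent plus the decoupled RoPE key,
        # shared across heads, per layer per token.
        per_token = cfg.kv_lora_rank + cfg.qk_rope_head_dim
    else:
        # MHA/GQA: cache K and V for each KV head, per layer per token.
        head_dim = getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads)
        num_kv_heads = getattr(cfg, "num_key_value_heads", cfg.num_attention_heads)
        per_token = 2 * num_kv_heads * head_dim
    total_bytes = per_token * cfg.num_hidden_layers * ctx_len * num_users * bytes_per_elem
    return total_bytes / 1e9

# Example usage (mirrors the app's defaults):
#   cfg = AutoConfig.from_pretrained("google/gemma-3-1b-it")
#   kv_cache_gb(cfg, ctx_len=128_000, num_users=1, dtype="fp16/bf16")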