add hf_token. add description
app.py
CHANGED
@@ -2,10 +2,16 @@ import gradio as gr
 from transformers import AutoConfig
 
 
-
-
-
-
+def calculate(name: str, ctx_len: int, num_users: int, dtype: str, hf_token: str):
+    try:
+        cfg = AutoConfig.from_pretrained(
+            name,
+            trust_remote_code=True,
+            token=hf_token,
+        )
+    except Exception as e:
+        raise gr.Error(e)
+
     use_mla = cfg.architectures[0].startswith(("DeepseekV2", "DeepseekV3"))
 
     if hasattr(cfg, "text_config"):
@@ -43,13 +49,28 @@ def calculate(name: str, ctx_len: int, num_users: int, dtype: str):
     return kv_cache_size, model_config
 
 
+DESCRIPTION = (
+    "NOTE:\n"
+    " - For gated repos, you will need to provide your HF token in the box below. You can "
+    "generate a new one at https://huggingface.co/settings/tokens. The token won't be stored "
+    "(you can check `app.py`).\n"
+    " - We don't take into account KV cache savings from sliding window attention (most "
+    "serving frameworks don't optimize for this anyway?)\n"
+    " - For Multi-head Latent Attention (MLA) used in DeepSeek-V2/V3, we calculate the "
+    "compressed KV cache as intended by MLA. This might not be supported on certain framework"
+    "+hardware combinations e.g. llama.cpp, MLX, which will fallback to Multi-head Attention "
+    "(MHA)."
+)
+
 demo = gr.Interface(
+    description=DESCRIPTION,
     fn=calculate,
     inputs=[
         gr.Textbox(label="model_id", value="google/gemma-3-1b-it"),
         gr.Number(label="Context length", value=128_000),
         gr.Number(label="No. of users", value=1),
         gr.Dropdown(label="KV cache dtype", choices=["fp16/bf16", "fp8"]),
+        gr.Textbox(label="HF token"),
     ],
     outputs=[
         gr.Number(label="KV cache size (GB)", precision=2),