Molbap (HF Staff) committed
Commit 67910fd · 1 Parent(s): 0ce686a
Files changed (1):
  1. app.py +164 -175

app.py CHANGED
@@ -1,195 +1,184 @@
- import os, json, subprocess, sys, textwrap, tempfile, shlex, pandas as pd
-
  import gradio as gr
  import spaces

  # --- Attention mask visualizer (Transformers) ---
- # Docs show: from transformers.utils.attention_visualizer import AttentionMaskVisualizer
- # Ref: https://huggingface.co/docs/transformers/... pages mention this util.
- def _import_visualizer():
-     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # type: ignore[attr-defined]
      return AttentionMaskVisualizer

  @spaces.GPU(duration=120)
- def run_attention_visualizer(model_id: str, prompt: str) -> str:
-     """
-     Returns HTML produced by AttentionMaskVisualizer(model_id)(prompt).
-     We render it into an HTML component.
-     """
-     AttentionMaskVisualizer = _import_visualizer()
      vis = AttentionMaskVisualizer(model_id)
-     html_or_obj = vis(prompt)  # recent Transformers returns embeddable HTML
-     return str(html_or_obj)
-
- # --- Minimal “terminal” (sandboxed) ---
- def run_shell(cmd: str) -> str:
-     # Simple, constrained shell: block backgrounding, pipes, redirects; allow common tooling.
-     blocked = any(tok in cmd for tok in ["|", ">", "<", "&&", "||", "`"])
-     if blocked:
-         return "Blocked characters detected. Use a single command without pipes/redirections."
-     try:
-         out = subprocess.run(
-             cmd, shell=True, check=False, capture_output=True, text=True, timeout=30
-         )
-         return f"$ {cmd}\n{out.stdout}{out.stderr}"
-     except Exception as e:
-         return f"$ {cmd}\n{e!r}"

- # --- KV-cache / CUDA caching allocator profiling ---
- # We launch a short Python program twice (allocator on/off) in a subprocess so the env var takes effect pre-import.
- PROFILE_SNIPPET = r"""
- import os, json, time, torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- model_id = os.environ.get("HF_MODEL_ID", "openai-community/gpt2")
- device = "cuda" if torch.cuda.is_available() else "cpu"
- tok = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else None).to(device)
-
- prompt = os.environ.get("HF_PROMPT", "Transformers are great for sequence modeling.")
- steps = int(os.environ.get("HF_STEPS", "32"))
-
- inputs = tok(prompt, return_tensors="pt").to(device)
- if device == "cuda":
-     torch.cuda.reset_peak_memory_stats()
-     torch.cuda.synchronize()
-
- def mem():
-     if device != "cuda":
-         return {"allocated": 0, "reserved": 0}
-     return {
-         "allocated": int(torch.cuda.memory_allocated()),
-         "reserved": int(torch.cuda.memory_reserved()),
-     }
-
- print(json.dumps({"t": 0, **mem()}), flush=True)
-
- # Step-by-step generation to grow KV cache
- past = None
- input_ids = inputs.input_ids
- for i in range(1, steps+1):
-     with torch.inference_mode():
-         out = model(input_ids=input_ids, use_cache=True, past_key_values=past)
-     past = out.past_key_values
-     # feed a single token next (use eos or last predicted token if available)
-     next_id = torch.tensor([[tok.eos_token_id or tok.pad_token_id or 0]], device=device)
-     input_ids = next_id
-     if device == "cuda":
-         torch.cuda.synchronize()
-     print(json.dumps({"t": i, **mem()}), flush=True)
- """

- def _run_profile_once(model_id: str, prompt: str, steps: int, disable_cache: bool) -> list[dict]:
-     env = os.environ.copy()
-     env["HF_MODEL_ID"] = model_id
-     env["HF_PROMPT"] = prompt
-     env["HF_STEPS"] = str(steps)
-     # IMPORTANT: set before torch import in the child
-     if disable_cache:
-         env["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
-     with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
-         f.write(PROFILE_SNIPPET)
-         path = f.name
      try:
-         p = subprocess.Popen(
-             [sys.executable, path],
-             stdout=subprocess.PIPE,
-             stderr=subprocess.PIPE,
-             text=True,
-             env=env,
-         )
-         out_lines = []
-         assert p.stdout is not None
-         for line in p.stdout:
-             line = line.strip()
-             if not line:
-                 continue
-             try:
-                 out_lines.append(json.loads(line))
-             except json.JSONDecodeError:
-                 # ignore stray prints from HF / torch
-                 pass
-         p.wait(timeout=300)
-         return out_lines
      finally:
-         try:
-             os.remove(path)
-         except OSError:
-             pass
-
- @spaces.GPU(duration=180)
- def profile_allocator(model_id: str, prompt: str, steps: int):
-     """Return a DataFrame ready for gr.LinePlot: t, MiB, kind, mode."""
-     on = _run_profile_once(model_id, prompt, steps, disable_cache=False)
-     off = _run_profile_once(model_id, prompt, steps, disable_cache=True)
-
-     def rows(series, mode):
-         for rec in series:
-             t = rec.get("t", 0)
-             allocated = rec.get("allocated", 0) / (1024**2)
-             reserved = rec.get("reserved", 0) / (1024**2)
-             yield {"t": t, "MiB": allocated, "kind": "allocated", "mode": mode}
-             yield {"t": t, "MiB": reserved, "kind": "reserved", "mode": mode}
-
-     df = pd.DataFrame(list(rows(on, "caching ON")) + list(rows(off, "caching OFF")))
-     return df
-
- # --- UI ---
- with gr.Blocks(fill_height=True) as demo:
-     gr.Markdown(
-         textwrap.dedent("""
-         ### Transformers feature showcase (ZeroGPU-ready)
-         - Attention mask visualizer
-         - Minimal terminal
-         - KV cache vs. CUDA caching allocator memory plot
-         """).strip()
-     )
-
-     with gr.Tabs():
-         with gr.Tab("Attention mask visualizer"):
-             with gr.Row():
-                 model_dd = gr.Dropdown(
-                     label="Model",
-                     choices=[
-                         "openai-community/gpt2",
-                         "google/gemma-2-2b",  # heavier; try if bandwidth allows
-                     ],
-                     value="openai-community/gpt2",
-                     allow_custom_value=True,
-                 )
-                 prompt_tb = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
-             run_btn = gr.Button("Render")
-             html_out = gr.HTML()
-             run_btn.click(run_attention_visualizer, inputs=[model_dd, prompt_tb], outputs=html_out)
-
-         with gr.Tab("Terminal (simplified)"):
-             cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
-             run_b = gr.Button("Run")
-             out = gr.Textbox(label="Output", lines=18, interactive=False)
-             run_b.click(run_shell, inputs=cmd, outputs=out)
-
-         with gr.Tab("Cache allocator plot"):
-             with gr.Row():
                  model_mem = gr.Dropdown(
                      label="Model",
-                     choices=["openai-community/gpt2"],
                      value="openai-community/gpt2",
                      allow_custom_value=True,
                  )
-                 prompt_mem = gr.Textbox(label="Prompt", value="A short test prompt.")
-                 steps = gr.Slider(8, 128, value=32, step=1, label="Steps (tokens)")
-             go = gr.Button("Profile")
-             df_out = gr.Dataframe(visible=False)  # optional debugging
-             plot = gr.LinePlot(
-                 x="t", y="MiB", color="mode", overlay_point=True,
-                 title="GPU memory over steps (allocated vs reserved; caching ON vs OFF)",
-                 group="kind", tooltip=["t", "MiB", "kind", "mode"], width=900, height=450
-             )
-             go.click(profile_allocator, inputs=[model_mem, prompt_mem, steps], outputs=plot)
-
-     # Placeholder for a future FastRTC tab; the Space structure supports it.
-     # See: https://www.gradio.app/guides/create-immersive-demo (WebRTC + Stream with FastRTC)

  if __name__ == "__main__":
      demo.launch()
-
 
+ import os, sys, time, threading, subprocess, json, textwrap, tempfile
  import gradio as gr
+ import pandas as pd
+
  import spaces
+ import torch
+
+ # --- Minimal safe terminal ---
+ def run_shell(cmd: str) -> str:
+     banned = ["|", ">", "<", "&&", "||", "`"]
+     if any(b in cmd for b in banned):
+         return "$ " + cmd + "\nBlocked characters. Use a single command."
+     try:
+         p = subprocess.run(cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+         return f"$ {cmd}\n{p.stdout}{p.stderr}"
+     except Exception as e:
+         return f"$ {cmd}\n{e!r}"

  # --- Attention mask visualizer (Transformers) ---
+ def _import_attention_visualizer():
+     # Available in recent transformers (utils.attention_visualizer)
+     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # noqa: F401
      return AttentionMaskVisualizer

  @spaces.GPU(duration=120)
+ def render_attention_mask(model_id: str, prompt: str) -> str:
+     AttentionMaskVisualizer = _import_attention_visualizer()
      vis = AttentionMaskVisualizer(model_id)
+     out = vis(prompt)  # returns embeddable HTML or an object with _repr_html_
+     return str(out)

+ # --- Transformers caching allocator warmup: time vs memory_allocated() ---
+ from transformers import AutoModelForCausalLM, modeling_utils as MU  # noqa: E402

+ def _measure_load_timeline(model_id: str, disable_warmup: bool):
+     orig = MU.caching_allocator_warmup
+     if disable_warmup:
+         MU.caching_allocator_warmup = lambda *a, **k: None
      try:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         tl = []
+
+         def sample(start_t, stop_evt):
+             while not stop_evt.is_set():
+                 if device == "cuda":
+                     torch.cuda.synchronize()
+                     alloc = torch.cuda.memory_allocated()
+                 else:
+                     alloc = 0
+                 tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
+                 time.sleep(0.05)
+
+         if device == "cuda":
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+
+         start = time.perf_counter()
+         stop_evt = threading.Event()
+         th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
+         th.start()
+
+         kwargs = {}
+         if device == "cuda":
+             kwargs.update(dict(torch_dtype=torch.float16, device_map="cuda:0", low_cpu_mem_usage=True))
+         model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+
+         stop_evt.set()
+         th.join()
+
+         if device == "cuda":
+             torch.cuda.synchronize()
+             tl.append({"t": time.perf_counter() - start, "MiB": torch.cuda.memory_allocated() / (1024**2)})
+
+         del model
+         if device == "cuda":
+             torch.cuda.empty_cache()
+             torch.cuda.ipc_collect()
+
+         return tl
      finally:
+         MU.caching_allocator_warmup = orig
+
+ @spaces.GPU(duration=240)
+ def profile_warmup(model_id: str):
+     on = _measure_load_timeline(model_id, disable_warmup=False)
+     off = _measure_load_timeline(model_id, disable_warmup=True)
+     rows = [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup ON"} for r in on] + \
+            [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup OFF"} for r in off]
+     return pd.DataFrame(rows)
+
+ # --- (Optional) FastRTC demo: simple loopback for structure; expand later ---
+ # Requires camera permissions in the browser.
+ try:
+     from fastrtc import WebRTC, ReplyOnPause  # type: ignore
+     def _echo_video(frame):
+         yield frame
+     HAS_FASTRTC = True
+ except Exception:
+     HAS_FASTRTC = False
+
+ # --- CSS for anchored, scrollable “playbook” layout ---
+ CSS = """
+ :root { --toc-w: 280px; }
+ #layout { display: grid; grid-template-columns: var(--toc-w) 1fr; gap: 1.25rem; }
+ #toc { position: sticky; top: 0.75rem; height: calc(100vh - 1.5rem); overflow: auto; padding-right: .5rem; }
+ #toc a { text-decoration: none; display: block; padding: .25rem 0; }
+ .section { scroll-margin-top: 72px; }
+ .gradio-container { max-width: 1200px !important; margin: 0 auto; }
+ hr { border: none; border-top: 1px solid var(--neutral-300); margin: 1.25rem 0; }
+ """
+
+ with gr.Blocks(css=CSS, fill_height=True, title="Transformers Feature Showcase (ZeroGPU)") as demo:
+     gr.HTML("<h1>Transformers Feature Showcase</h1><p>Interactive, scrollable demo.</p>")
+     with gr.Row(elem_id="layout"):
+         # TOC
+         with gr.Column(scale=0):
+             gr.HTML(
+                 """
+                 <nav id="toc">
+                   <h3>Sections</h3>
+                   <a href="#terminal">Terminal</a>
+                   <a href="#attention">Attention mask visualizer</a>
+                   <a href="#allocator">Allocator warmup timeline</a>
+                   <a href="#rtc">FastRTC (preview)</a>
+                 </nav>
+                 """
+             )
+         # Content
+         with gr.Column():
+             # Terminal
+             gr.HTML('<h2 id="terminal" class="section">Terminal</h2>')
+             with gr.Group():
+                 cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
+                 run_btn = gr.Button("Run")
+                 out = gr.Textbox(label="Output", lines=12)
+                 run_btn.click(run_shell, inputs=cmd, outputs=out)
+             gr.HTML("<hr/>")
+
+             # Attention visualizer
+             gr.HTML('<h2 id="attention" class="section">Attention mask visualizer</h2>')
+             with gr.Group():
+                 with gr.Row():
+                     model_vis = gr.Dropdown(
+                         label="Model",
+                         choices=["openai-community/gpt2", "google/gemma-2-2b"],
+                         value="openai-community/gpt2",
+                         allow_custom_value=True,
+                     )
+                     prompt_vis = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
+                 go_vis = gr.Button("Render")
+                 html_vis = gr.HTML()
+                 go_vis.click(render_attention_mask, inputs=[model_vis, prompt_vis], outputs=html_vis)
+             gr.HTML("<hr/>")
+
+             # Allocator warmup
+             gr.HTML('<h2 id="allocator" class="section">Transformers allocator warmup: time vs allocated MiB</h2>')
+             with gr.Group():
                  model_mem = gr.Dropdown(
                      label="Model",
+                     choices=["openai-community/gpt2", "google/gemma-2-2b"],
                      value="openai-community/gpt2",
                      allow_custom_value=True,
                  )
+                 go_mem = gr.Button("Run")
+                 plot = gr.LinePlot(
+                     x="t", y="MiB", color="mode", overlay_point=True,
+                     title="from_pretrained() load: time vs CUDA memory_allocated()",
+                     tooltip=["t", "MiB", "mode"], width=900, height=420
+                 )
+                 go_mem.click(profile_warmup, inputs=[model_mem], outputs=plot)
+             gr.HTML("<hr/>")
+
+             # FastRTC preview
+             gr.HTML('<h2 id="rtc" class="section">FastRTC (preview)</h2>')
+             if HAS_FASTRTC:
+                 with gr.Group():
+                     gr.Markdown("Camera loopback using FastRTC WebRTC. Extend with streaming handlers later.")
+                     rtc = WebRTC(mode="send-receive", modality="video")
+                     rtc.stream(ReplyOnPause(_echo_video), inputs=[rtc], outputs=[rtc], time_limit=60)
+             else:
+                 gr.Markdown("Install `fastrtc` to enable this section.")

  if __name__ == "__main__":
      demo.launch()
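
For reference, the warmup ON/OFF comparison added in this commit can also be reproduced outside the Space. The snippet below is a minimal sketch, not part of the commit: it assumes a CUDA device and a transformers release that exposes modeling_utils.caching_allocator_warmup (the same hook the commit monkeypatches), and it reports total load time plus peak memory_allocated() instead of a sampled timeline.

import time

import torch
from transformers import AutoModelForCausalLM, modeling_utils


def time_load(model_id: str, disable_warmup: bool) -> dict:
    # Swap the warmup hook for a no-op, load the model, then restore the hook.
    original = modeling_utils.caching_allocator_warmup
    if disable_warmup:
        modeling_utils.caching_allocator_warmup = lambda *args, **kwargs: None
    try:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        start = time.perf_counter()
        model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="cuda:0"
        )
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        peak_mib = torch.cuda.max_memory_allocated() / (1024 ** 2)
        del model
        torch.cuda.empty_cache()
        return {"seconds": round(elapsed, 2), "peak_MiB": round(peak_mib, 1)}
    finally:
        modeling_utils.caching_allocator_warmup = original


if __name__ == "__main__":
    for disable in (False, True):
        label = "warmup OFF" if disable else "warmup ON "
        print(label, time_load("openai-community/gpt2", disable_warmup=disable))

Running each mode more than once and averaging would make the comparison less noisy, since the first load also pays for download and disk caching.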