Molbap (HF Staff) committed
Commit 67910fd · 1 Parent(s): 0ce686a
Files changed (1):
  1. app.py +164 -175

app.py CHANGED
@@ -1,195 +1,184 @@
- import os, json, subprocess, sys, textwrap, tempfile, shlex, pandas as pd
-
  import gradio as gr
  import spaces

  # --- Attention mask visualizer (Transformers) ---
- # Docs show: from transformers.utils.attention_visualizer import AttentionMaskVisualizer
- # Ref: https://huggingface.co/docs/transformers/... pages mention this util.
- def _import_visualizer():
-     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # type: ignore[attr-defined]
      return AttentionMaskVisualizer

  @spaces.GPU(duration=120)
- def run_attention_visualizer(model_id: str, prompt: str) -> str:
-     """
-     Returns HTML produced by AttentionMaskVisualizer(model_id)(prompt).
-     We render it into an HTML component.
-     """
-     AttentionMaskVisualizer = _import_visualizer()
      vis = AttentionMaskVisualizer(model_id)
-     html_or_obj = vis(prompt)  # recent Transformers returns embeddable HTML
-     return str(html_or_obj)
-
- # --- Minimal “terminal” (sandboxed) ---
- def run_shell(cmd: str) -> str:
-     # Simple, constrained shell: block backgrounding, pipes, redirects; allow common tooling.
-     blocked = any(tok in cmd for tok in ["|", ">", "<", "&&", "||", "`"])
-     if blocked:
-         return "Blocked characters detected. Use a single command without pipes/redirections."
-     try:
-         out = subprocess.run(
-             cmd, shell=True, check=False, capture_output=True, text=True, timeout=30
-         )
-         return f"$ {cmd}\n{out.stdout}{out.stderr}"
-     except Exception as e:
-         return f"$ {cmd}\n{e!r}"

- # --- KV-cache / CUDA caching allocator profiling ---
- # We launch a short Python program twice (allocator on/off) in a subprocess so the env var takes effect pre-import.
- PROFILE_SNIPPET = r"""
- import os, json, time, torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- model_id = os.environ.get("HF_MODEL_ID", "openai-community/gpt2")
- device = "cuda" if torch.cuda.is_available() else "cpu"
- tok = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else None).to(device)
-
- prompt = os.environ.get("HF_PROMPT", "Transformers are great for sequence modeling.")
- steps = int(os.environ.get("HF_STEPS", "32"))
-
- inputs = tok(prompt, return_tensors="pt").to(device)
- if device == "cuda":
-     torch.cuda.reset_peak_memory_stats()
-     torch.cuda.synchronize()
-
- def mem():
-     if device != "cuda":
-         return {"allocated": 0, "reserved": 0}
-     return {
-         "allocated": int(torch.cuda.memory_allocated()),
-         "reserved": int(torch.cuda.memory_reserved()),
-     }
-
- print(json.dumps({"t": 0, **mem()}), flush=True)
-
- # Step-by-step generation to grow KV cache
- past = None
- input_ids = inputs.input_ids
- for i in range(1, steps+1):
-     with torch.inference_mode():
-         out = model(input_ids=input_ids, use_cache=True, past_key_values=past)
-     past = out.past_key_values
-     # feed a single token next (use eos or last predicted token if available)
-     next_id = torch.tensor([[tok.eos_token_id or tok.pad_token_id or 0]], device=device)
-     input_ids = next_id
-     if device == "cuda":
-         torch.cuda.synchronize()
-     print(json.dumps({"t": i, **mem()}), flush=True)
- """

- def _run_profile_once(model_id: str, prompt: str, steps: int, disable_cache: bool) -> list[dict]:
-     env = os.environ.copy()
-     env["HF_MODEL_ID"] = model_id
-     env["HF_PROMPT"] = prompt
-     env["HF_STEPS"] = str(steps)
-     # IMPORTANT: set before torch import in the child
-     if disable_cache:
-         env["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
-     with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
-         f.write(PROFILE_SNIPPET)
-         path = f.name
      try:
-         p = subprocess.Popen(
-             [sys.executable, path],
-             stdout=subprocess.PIPE,
-             stderr=subprocess.PIPE,
-             text=True,
-             env=env,
-         )
-         out_lines = []
-         assert p.stdout is not None
-         for line in p.stdout:
-             line = line.strip()
-             if not line:
-                 continue
-             try:
-                 out_lines.append(json.loads(line))
-             except json.JSONDecodeError:
-                 # ignore stray prints from HF / torch
-                 pass
-         p.wait(timeout=300)
-         return out_lines
      finally:
-         try:
-             os.remove(path)
-         except OSError:
-             pass
-
- @spaces.GPU(duration=180)
- def profile_allocator(model_id: str, prompt: str, steps: int):
-     """Return a DataFrame ready for gr.LinePlot: t, MiB, kind, mode."""
-     on = _run_profile_once(model_id, prompt, steps, disable_cache=False)
-     off = _run_profile_once(model_id, prompt, steps, disable_cache=True)
-
-     def rows(series, mode):
-         for rec in series:
-             t = rec.get("t", 0)
-             allocated = rec.get("allocated", 0) / (1024**2)
-             reserved = rec.get("reserved", 0) / (1024**2)
-             yield {"t": t, "MiB": allocated, "kind": "allocated", "mode": mode}
-             yield {"t": t, "MiB": reserved, "kind": "reserved", "mode": mode}
-
-     df = pd.DataFrame(list(rows(on, "caching ON")) + list(rows(off, "caching OFF")))
-     return df
-
- # --- UI ---
- with gr.Blocks(fill_height=True) as demo:
-     gr.Markdown(
-         textwrap.dedent("""
-         ### Transformers feature showcase (ZeroGPU-ready)
-         - Attention mask visualizer
-         - Minimal terminal
-         - KV cache vs. CUDA caching allocator memory plot
-         """).strip()
-     )
-
-     with gr.Tabs():
-         with gr.Tab("Attention mask visualizer"):
-             with gr.Row():
-                 model_dd = gr.Dropdown(
-                     label="Model",
-                     choices=[
-                         "openai-community/gpt2",
-                         "google/gemma-2-2b",  # heavier; try if bandwidth allows
-                     ],
-                     value="openai-community/gpt2",
-                     allow_custom_value=True,
-                 )
-                 prompt_tb = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
-             run_btn = gr.Button("Render")
-             html_out = gr.HTML()
-             run_btn.click(run_attention_visualizer, inputs=[model_dd, prompt_tb], outputs=html_out)
-
-         with gr.Tab("Terminal (simplified)"):
-             cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
-             run_b = gr.Button("Run")
-             out = gr.Textbox(label="Output", lines=18, interactive=False)
-             run_b.click(run_shell, inputs=cmd, outputs=out)
-
-         with gr.Tab("Cache allocator plot"):
-             with gr.Row():
                  model_mem = gr.Dropdown(
                      label="Model",
-                     choices=["openai-community/gpt2"],
                      value="openai-community/gpt2",
                      allow_custom_value=True,
                  )
-                 prompt_mem = gr.Textbox(label="Prompt", value="A short test prompt.")
-                 steps = gr.Slider(8, 128, value=32, step=1, label="Steps (tokens)")
-             go = gr.Button("Profile")
-             df_out = gr.Dataframe(visible=False)  # optional debugging
-             plot = gr.LinePlot(
-                 x="t", y="MiB", color="mode", overlay_point=True,
-                 title="GPU memory over steps (allocated vs reserved; caching ON vs OFF)",
-                 group="kind", tooltip=["t", "MiB", "kind", "mode"], width=900, height=450
-             )
-             go.click(profile_allocator, inputs=[model_mem, prompt_mem, steps], outputs=plot)
-
-     # Placeholder for a future FastRTC tab; the Space structure supports it.
-     # See: https://www.gradio.app/guides/create-immersive-demo (WebRTC + Stream with FastRTC)

  if __name__ == "__main__":
      demo.launch()
-
 
+ import os, sys, time, threading, subprocess, json, textwrap, tempfile
  import gradio as gr
+ import pandas as pd
+
  import spaces
+ import torch
+
+ # --- Minimal safe terminal ---
+ def run_shell(cmd: str) -> str:
+     banned = ["|", ">", "<", "&&", "||", "`"]
+     if any(b in cmd for b in banned):
+         return "$ " + cmd + "\nBlocked characters. Use a single command."
+     try:
+         p = subprocess.run(cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+         return f"$ {cmd}\n{p.stdout}{p.stderr}"
+     except Exception as e:
+         return f"$ {cmd}\n{e!r}"

  # --- Attention mask visualizer (Transformers) ---
+ def _import_attention_visualizer():
+     # Available in recent transformers (utils.attention_visualizer)
+     from transformers.utils.attention_visualizer import AttentionMaskVisualizer  # noqa: F401
      return AttentionMaskVisualizer

  @spaces.GPU(duration=120)
+ def render_attention_mask(model_id: str, prompt: str) -> str:
+     AttentionMaskVisualizer = _import_attention_visualizer()
      vis = AttentionMaskVisualizer(model_id)
+     out = vis(prompt)  # returns embeddable HTML or an object with _repr_html_
+     return str(out)

+ # --- Transformers caching allocator warmup: time vs memory_allocated() ---
+ from transformers import AutoModelForCausalLM, modeling_utils as MU  # noqa: E402

+ def _measure_load_timeline(model_id: str, disable_warmup: bool):
+     orig = MU.caching_allocator_warmup
+     if disable_warmup:
+         MU.caching_allocator_warmup = lambda *a, **k: None
      try:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         tl = []
+
+         def sample(start_t, stop_evt):
+             while not stop_evt.is_set():
+                 if device == "cuda":
+                     torch.cuda.synchronize()
+                     alloc = torch.cuda.memory_allocated()
+                 else:
+                     alloc = 0
+                 tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
+                 time.sleep(0.05)
+
+         if device == "cuda":
+             torch.cuda.empty_cache()
+             torch.cuda.reset_peak_memory_stats()
+
+         start = time.perf_counter()
+         stop_evt = threading.Event()
+         th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
+         th.start()
+
+         kwargs = {}
+         if device == "cuda":
+             kwargs.update(dict(torch_dtype=torch.float16, device_map="cuda:0", low_cpu_mem_usage=True))
+         model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+
+         stop_evt.set()
+         th.join()
+
+         if device == "cuda":
+             torch.cuda.synchronize()
+             tl.append({"t": time.perf_counter() - start, "MiB": torch.cuda.memory_allocated() / (1024**2)})
+
+         del model
+         if device == "cuda":
+             torch.cuda.empty_cache()
+             torch.cuda.ipc_collect()
+
+         return tl
      finally:
+         MU.caching_allocator_warmup = orig
+
+ @spaces.GPU(duration=240)
+ def profile_warmup(model_id: str):
+     on = _measure_load_timeline(model_id, disable_warmup=False)
+     off = _measure_load_timeline(model_id, disable_warmup=True)
+     rows = [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup ON"} for r in on] + \
+            [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup OFF"} for r in off]
+     return pd.DataFrame(rows)
+
+ # --- (Optional) FastRTC demo: simple loopback for structure; expand later ---
+ # Requires camera permissions in the browser.
+ try:
+     from fastrtc import WebRTC, ReplyOnPause  # type: ignore
+     def _echo_video(frame):
+         yield frame
+     HAS_FASTRTC = True
+ except Exception:
+     HAS_FASTRTC = False
+
+ # --- CSS for anchored, scrollable “playbook” layout ---
+ CSS = """
+ :root { --toc-w: 280px; }
+ #layout { display: grid; grid-template-columns: var(--toc-w) 1fr; gap: 1.25rem; }
+ #toc { position: sticky; top: 0.75rem; height: calc(100vh - 1.5rem); overflow: auto; padding-right: .5rem; }
+ #toc a { text-decoration: none; display: block; padding: .25rem 0; }
+ .section { scroll-margin-top: 72px; }
+ .gradio-container { max-width: 1200px !important; margin: 0 auto; }
+ hr { border: none; border-top: 1px solid var(--neutral-300); margin: 1.25rem 0; }
+ """
+
+ with gr.Blocks(css=CSS, fill_height=True, title="Transformers Feature Showcase (ZeroGPU)") as demo:
+     gr.HTML("<h1>Transformers Feature Showcase</h1><p>Interactive, scrollable demo.</p>")
+     with gr.Row(elem_id="layout"):
+         # TOC
+         with gr.Column(scale=0):
+             gr.HTML(
+                 """
+                 <nav id="toc">
+                   <h3>Sections</h3>
+                   <a href="#terminal">Terminal</a>
+                   <a href="#attention">Attention mask visualizer</a>
+                   <a href="#allocator">Allocator warmup timeline</a>
+                   <a href="#rtc">FastRTC (preview)</a>
+                 </nav>
+                 """
+             )
+         # Content
+         with gr.Column():
+             # Terminal
+             gr.HTML('<h2 id="terminal" class="section">Terminal</h2>')
+             with gr.Group():
+                 cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
+                 run_btn = gr.Button("Run")
+                 out = gr.Textbox(label="Output", lines=12)
+                 run_btn.click(run_shell, inputs=cmd, outputs=out)
+             gr.HTML("<hr/>")
+
+             # Attention visualizer
+             gr.HTML('<h2 id="attention" class="section">Attention mask visualizer</h2>')
+             with gr.Group():
+                 with gr.Row():
+                     model_vis = gr.Dropdown(
+                         label="Model",
+                         choices=["openai-community/gpt2", "google/gemma-2-2b"],
+                         value="openai-community/gpt2",
+                         allow_custom_value=True,
+                     )
+                     prompt_vis = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
+                 go_vis = gr.Button("Render")
+                 html_vis = gr.HTML()
+                 go_vis.click(render_attention_mask, inputs=[model_vis, prompt_vis], outputs=html_vis)
+             gr.HTML("<hr/>")
+
+             # Allocator warmup
+             gr.HTML('<h2 id="allocator" class="section">Transformers allocator warmup: time vs allocated MiB</h2>')
+             with gr.Group():
                  model_mem = gr.Dropdown(
                      label="Model",
+                     choices=["openai-community/gpt2", "google/gemma-2-2b"],
                      value="openai-community/gpt2",
                      allow_custom_value=True,
                  )
+                 go_mem = gr.Button("Run")
+                 plot = gr.LinePlot(
+                     x="t", y="MiB", color="mode", overlay_point=True,
+                     title="from_pretrained() load: time vs CUDA memory_allocated()",
+                     tooltip=["t", "MiB", "mode"], width=900, height=420
+                 )
+                 go_mem.click(profile_warmup, inputs=[model_mem], outputs=plot)
+             gr.HTML("<hr/>")
+
+             # FastRTC preview
+             gr.HTML('<h2 id="rtc" class="section">FastRTC (preview)</h2>')
+             if HAS_FASTRTC:
+                 with gr.Group():
+                     gr.Markdown("Camera loopback using FastRTC WebRTC. Extend with streaming handlers later.")
+                     rtc = WebRTC(mode="send-receive", modality="video")
+                     rtc.stream(ReplyOnPause(_echo_video), inputs=[rtc], outputs=[rtc], time_limit=60)
+             else:
+                 gr.Markdown("Install `fastrtc` to enable this section.")

  if __name__ == "__main__":
      demo.launch()
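
For reference, the warmup ON/OFF comparison added in this commit can also be reproduced outside the Space. The snippet below is a minimal sketch, not part of the commit: it assumes a CUDA device and a transformers release that exposes modeling_utils.caching_allocator_warmup (the same hook the commit monkeypatches), and it reports total load time plus peak memory_allocated() instead of a sampled timeline.

import time

import torch
from transformers import AutoModelForCausalLM, modeling_utils


def time_load(model_id: str, disable_warmup: bool) -> dict:
    # Swap the warmup hook for a no-op, load the model, then restore the hook.
    original = modeling_utils.caching_allocator_warmup
    if disable_warmup:
        modeling_utils.caching_allocator_warmup = lambda *args, **kwargs: None
    try:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        start = time.perf_counter()
        model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="cuda:0"
        )
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        peak_mib = torch.cuda.max_memory_allocated() / (1024 ** 2)
        del model
        torch.cuda.empty_cache()
        return {"seconds": round(elapsed, 2), "peak_MiB": round(peak_mib, 1)}
    finally:
        modeling_utils.caching_allocator_warmup = original


if __name__ == "__main__":
    for disable in (False, True):
        label = "warmup OFF" if disable else "warmup ON "
        print(label, time_load("openai-community/gpt2", disable_warmup=disable))

Running each mode more than once and averaging would make the comparison less noisy, since the first load also pays for download and disk caching.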