prithivMLmods commited on
Commit
3e64a53
·
verified ·
1 Parent(s): 18a6809

update app

Browse files
Files changed (1) hide show
  1. app.py +1306 -339
app.py CHANGED
@@ -1,214 +1,35 @@
1
  import os
 
 
 
 
 
2
  import random
3
  import uuid
4
- import json
5
- import requests
6
  import time
7
- import asyncio
8
  from threading import Thread
9
- from typing import Iterable
10
 
11
  import gradio as gr
12
  import spaces
13
  import torch
14
  import numpy as np
15
- from PIL import Image
16
  import cv2
17
 
18
  from transformers import (
19
  Qwen2_5_VLForConditionalGeneration,
20
- Qwen2VLForConditionalGeneration,
21
  AutoProcessor,
22
- AutoTokenizer,
23
  TextIteratorStreamer,
24
  )
25
- from gradio.themes import Soft
26
- from gradio.themes.utils import colors, fonts, sizes
27
-
28
- colors.steel_blue = colors.Color(
29
- name="steel_blue",
30
- c50="#EBF3F8",
31
- c100="#D3E5F0",
32
- c200="#A8CCE1",
33
- c300="#7DB3D2",
34
- c400="#529AC3",
35
- c500="#4682B4",
36
- c600="#3E72A0",
37
- c700="#36638C",
38
- c800="#2E5378",
39
- c900="#264364",
40
- c950="#1E3450",
41
- )
42
-
43
- class SteelBlueTheme(Soft):
44
- def __init__(
45
- self,
46
- *,
47
- primary_hue: colors.Color | str = colors.gray,
48
- secondary_hue: colors.Color | str = colors.steel_blue,
49
- neutral_hue: colors.Color | str = colors.slate,
50
- text_size: sizes.Size | str = sizes.text_lg,
51
- font: fonts.Font | str | Iterable[fonts.Font | str] = (
52
- fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
53
- ),
54
- font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
55
- fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
56
- ),
57
- ):
58
- super().__init__(
59
- primary_hue=primary_hue,
60
- secondary_hue=secondary_hue,
61
- neutral_hue=neutral_hue,
62
- text_size=text_size,
63
- font=font,
64
- font_mono=font_mono,
65
- )
66
- super().set(
67
- background_fill_primary="*primary_50",
68
- background_fill_primary_dark="*primary_900",
69
- body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
70
- body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
71
- button_primary_text_color="white",
72
- button_primary_text_color_hover="white",
73
- button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
74
- button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
75
- button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
76
- button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
77
- slider_color="*secondary_500",
78
- slider_color_dark="*secondary_600",
79
- block_title_text_weight="600",
80
- block_border_width="3px",
81
- block_shadow="*shadow_drop_lg",
82
- button_primary_shadow="*shadow_drop_lg",
83
- button_large_padding="11px",
84
- color_accent_soft="*primary_100",
85
- block_label_background_fill="*primary_200",
86
- )
87
-
88
- steel_blue_theme = SteelBlueTheme()
89
-
90
- css = """
91
- #main-title h1 {
92
- font-size: 2.3em !important;
93
- }
94
- #output-title h2 {
95
- font-size: 2.2em !important;
96
- }
97
-
98
- /* RadioAnimated Styles */
99
- .ra-wrap{ width: fit-content; }
100
- .ra-inner{
101
- position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
102
- background: var(--neutral-200); border-radius: 9999px; overflow: hidden;
103
- }
104
- .ra-input{ display: none; }
105
- .ra-label{
106
- position: relative; z-index: 2; padding: 8px 16px;
107
- font-family: inherit; font-size: 14px; font-weight: 600;
108
- color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap;
109
- }
110
- .ra-highlight{
111
- position: absolute; z-index: 1; top: 6px; left: 6px;
112
- height: calc(100% - 12px); border-radius: 9999px;
113
- background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
114
- transition: transform 0.2s, width 0.2s;
115
- }
116
- .ra-input:checked + .ra-label{ color: black; }
117
-
118
- /* Dark mode adjustments for Radio */
119
- .dark .ra-inner { background: var(--neutral-800); }
120
- .dark .ra-label { color: var(--neutral-400); }
121
- .dark .ra-highlight { background: var(--neutral-600); }
122
- .dark .ra-input:checked + .ra-label { color: white; }
123
-
124
- #gpu-duration-container {
125
- padding: 10px;
126
- border-radius: 8px;
127
- background: var(--background-fill-secondary);
128
- border: 1px solid var(--border-color-primary);
129
- margin-top: 10px;
130
- }
131
- """
132
 
133
  MAX_MAX_NEW_TOKENS = 4096
134
  DEFAULT_MAX_NEW_TOKENS = 1024
135
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
136
 
137
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
138
-
139
- class RadioAnimated(gr.HTML):
140
- def __init__(self, choices, value=None, **kwargs):
141
- if not choices or len(choices) < 2:
142
- raise ValueError("RadioAnimated requires at least 2 choices.")
143
- if value is None:
144
- value = choices[0]
145
-
146
- uid = uuid.uuid4().hex[:8]
147
- group_name = f"ra-{uid}"
148
-
149
- inputs_html = "\n".join(
150
- f"""
151
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
152
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
153
- """
154
- for i, c in enumerate(choices)
155
- )
156
-
157
- html_template = f"""
158
- <div class="ra-wrap" data-ra="{uid}">
159
- <div class="ra-inner">
160
- <div class="ra-highlight"></div>
161
- {inputs_html}
162
- </div>
163
- </div>
164
- """
165
-
166
- js_on_load = r"""
167
- (() => {
168
- const wrap = element.querySelector('.ra-wrap');
169
- const inner = element.querySelector('.ra-inner');
170
- const highlight = element.querySelector('.ra-highlight');
171
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
172
-
173
- if (!inputs.length) return;
174
-
175
- const choices = inputs.map(i => i.value);
176
-
177
- function setHighlightByIndex(idx) {
178
- const n = choices.length;
179
- const pct = 100 / n;
180
- highlight.style.width = `calc(${pct}% - 6px)`;
181
- highlight.style.transform = `translateX(${idx * 100}%)`;
182
- }
183
-
184
- function setCheckedByValue(val, shouldTrigger=false) {
185
- const idx = Math.max(0, choices.indexOf(val));
186
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
187
- setHighlightByIndex(idx);
188
-
189
- props.value = choices[idx];
190
- if (shouldTrigger) trigger('change', props.value);
191
- }
192
-
193
- setCheckedByValue(props.value ?? choices[0], false);
194
-
195
- inputs.forEach((inp) => {
196
- inp.addEventListener('change', () => {
197
- setCheckedByValue(inp.value, true);
198
- });
199
- });
200
- })();
201
- """
202
-
203
- super().__init__(
204
- value=value,
205
- html_template=html_template,
206
- js_on_load=js_on_load,
207
- **kwargs
208
- )
209
-
210
- def apply_gpu_duration(val: str):
211
- return int(val)
212
 
213
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
214
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
@@ -255,18 +76,175 @@ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
255
  torch_dtype=torch.float16
256
  ).to(device).eval()
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def downsample_video(video_path):
259
- """
260
- Downsamples the video to evenly spaced frames.
261
- Each frame is returned as a PIL image along with its timestamp.
262
- """
263
  vidcap = cv2.VideoCapture(video_path)
264
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
265
  fps = vidcap.get(cv2.CAP_PROP_FPS)
266
  frames = []
 
 
 
267
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
268
  for i in frame_indices:
269
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
270
  success, image = vidcap.read()
271
  if success:
272
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -276,53 +254,33 @@ def downsample_video(video_path):
276
  vidcap.release()
277
  return frames
278
 
279
- def calc_timeout_image(model_name: str, text: str, image: Image.Image,
280
- max_new_tokens: int, temperature: float, top_p: float,
281
- top_k: int, repetition_penalty: float, gpu_timeout: int):
282
- """Calculate GPU timeout duration for image inference."""
283
  try:
284
  return int(gpu_timeout)
285
- except:
286
  return 60
287
 
288
- def calc_timeout_video(model_name: str, text: str, video_path: str,
289
- max_new_tokens: int, temperature: float, top_p: float,
290
- top_k: int, repetition_penalty: float, gpu_timeout: int):
291
- """Calculate GPU timeout duration for video inference."""
292
  try:
293
  return int(gpu_timeout)
294
- except:
295
  return 60
296
-
297
- @spaces.GPU(duration=calc_timeout_image)
298
- def generate_image(model_name: str, text: str, image: Image.Image,
299
- max_new_tokens: int = 1024,
300
- temperature: float = 0.6,
301
- top_p: float = 0.9,
302
- top_k: int = 50,
303
- repetition_penalty: float = 1.2,
304
- gpu_timeout: int = 60):
305
- """
306
- Generates responses using the selected model for image input.
307
- Yields raw text and Markdown-formatted text.
308
- """
309
- if model_name == "SkyCaptioner-V1":
310
- processor, model = processor_m, model_m
311
- elif model_name == "DeepCaption-VLA-7B":
312
- processor, model = processor_n, model_n
313
- elif model_name == "SpaceThinker-3B":
314
- processor, model = processor_z, model_z
315
- elif model_name == "coreOCR-7B-050325-preview":
316
- processor, model = processor_k, model_k
317
- elif model_name == "SpaceOm-3B":
318
- processor, model = processor_y, model_y
319
- else:
320
- yield "Invalid model selected.", "Invalid model selected."
321
- return
322
 
 
 
 
 
 
323
  if image is None:
324
- yield "Please upload an image.", "Please upload an image."
325
- return
 
 
 
 
 
326
 
327
  messages = [{
328
  "role": "user",
@@ -331,6 +289,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
331
  {"type": "text", "text": text},
332
  ]
333
  }]
 
334
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
335
  inputs = processor(
336
  text=[prompt_full],
@@ -340,48 +299,49 @@ def generate_image(model_name: str, text: str, image: Image.Image,
340
  truncation=True,
341
  max_length=MAX_INPUT_TOKEN_LENGTH
342
  ).to(device)
 
343
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
344
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
345
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
346
  thread.start()
 
347
  buffer = ""
348
  for new_text in streamer:
349
- buffer += new_text
350
- buffer = buffer.replace("<|im_end|>", "")
351
  time.sleep(0.01)
352
- yield buffer, buffer
 
 
 
 
 
353
 
354
  @spaces.GPU(duration=calc_timeout_video)
355
- def generate_video(model_name: str, text: str, video_path: str,
356
- max_new_tokens: int = 1024,
357
- temperature: float = 0.6,
358
- top_p: float = 0.9,
359
- top_k: int = 50,
360
- repetition_penalty: float = 1.2,
361
- gpu_timeout: int = 90):
362
- """
363
- Generates responses using the selected model for video input.
364
- Yields raw text and Markdown-formatted text.
365
- """
366
- if model_name == "SkyCaptioner-V1":
367
- processor, model = processor_m, model_m
368
- elif model_name == "DeepCaption-VLA-7B":
369
- processor, model = processor_n, model_n
370
- elif model_name == "SpaceThinker-3B":
371
- processor, model = processor_z, model_z
372
- elif model_name == "coreOCR-7B-050325-preview":
373
- processor, model = processor_k, model_k
374
- elif model_name == "SpaceOm-3B":
375
- processor, model = processor_y, model_y
376
- else:
377
- yield "Invalid model selected.", "Invalid model selected."
378
- return
379
-
380
- if video_path is None:
381
- yield "Please upload a video.", "Please upload a video."
382
- return
383
 
 
384
  frames = downsample_video(video_path)
 
 
 
385
  messages = [
386
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
387
  {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -390,6 +350,7 @@ def generate_video(model_name: str, text: str, video_path: str,
390
  image, timestamp = frame
391
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
392
  messages[1]["content"].append({"type": "image", "image": image})
 
393
  inputs = processor.apply_chat_template(
394
  messages,
395
  tokenize=True,
@@ -399,100 +360,1106 @@ def generate_video(model_name: str, text: str, video_path: str,
399
  truncation=True,
400
  max_length=MAX_INPUT_TOKEN_LENGTH
401
  ).to(device)
 
402
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
403
  generation_kwargs = {
404
  **inputs,
405
  "streamer": streamer,
406
- "max_new_tokens": max_new_tokens,
407
  "do_sample": True,
408
- "temperature": temperature,
409
- "top_p": top_p,
410
- "top_k": top_k,
411
- "repetition_penalty": repetition_penalty,
412
  }
 
413
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
414
  thread.start()
 
415
  buffer = ""
416
  for new_text in streamer:
417
- buffer += new_text
418
- buffer = buffer.replace("<|im_end|>", "")
419
  time.sleep(0.01)
420
- yield buffer, buffer
421
 
422
- image_examples = [
423
- ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
424
- ["count the number of birds and explain the scene in detail.", "images/2.jpeg"],
425
- ["how far is the Goal from the penalty taker in this image?.", "images/3.png"],
426
- ["approximately how many meters apart are the chair and bookshelf?.", "images/4.png"],
427
- ["how far is the man in the red hat from the pallet of boxes in feet?.", "images/5.jpg"],
428
- ]
429
 
430
- video_examples = [
431
- ["give the highlights of the movie scene video.", "videos/1.mp4"],
432
- ["explain the advertisement in detail.", "videos/2.mp4"]
433
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  with gr.Blocks() as demo:
436
- gr.Markdown("# **VisionScope R2**", elem_id="main-title")
437
- with gr.Row():
438
- with gr.Column(scale=2):
439
- with gr.Tabs():
440
- with gr.TabItem("Image Inference"):
441
- image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
442
- image_upload = gr.Image(type="pil", label="Upload Image", height=290)
443
- image_submit = gr.Button("Submit", variant="primary")
444
- gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
445
- with gr.TabItem("Video Inference"):
446
- video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
447
- video_upload = gr.Video(label="Upload Video(<= 30s)", height=290)
448
- video_submit = gr.Button("Submit", variant="primary")
449
- gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
450
- with gr.Accordion("Advanced options", open=False):
451
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
452
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
453
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
454
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
455
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
456
- with gr.Column(scale=3):
457
- gr.Markdown("## Output", elem_id="output-title")
458
- output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=11)
459
- with gr.Accordion("(Result.md)", open=False):
460
- markdown_output = gr.Markdown(label="Formatted Result")
461
- model_choice = gr.Radio(
462
- choices=["DeepCaption-VLA-7B", "SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
463
- label="Select Model",
464
- value="DeepCaption-VLA-7B"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  )
466
-
467
- with gr.Row(elem_id="gpu-duration-container"):
468
- with gr.Column():
469
- gr.Markdown("**GPU Duration (seconds)**")
470
- radioanimated_gpu_duration = RadioAnimated(
471
- choices=["60", "90", "120", "180", "240", "300"],
472
- value="60",
473
- elem_id="radioanimated_gpu_duration"
474
- )
475
- gpu_duration_state = gr.Number(value=60, visible=False)
476
-
477
- gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
478
-
479
- radioanimated_gpu_duration.change(
480
- fn=apply_gpu_duration,
481
- inputs=radioanimated_gpu_duration,
482
- outputs=[gpu_duration_state],
483
- api_visibility="private"
484
- )
485
 
486
- image_submit.click(
487
- fn=generate_image,
488
- inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
489
- outputs=[output, markdown_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  )
491
- video_submit.click(
492
- fn=generate_video,
493
- inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
494
- outputs=[output, markdown_output]
 
 
495
  )
496
 
497
  if __name__ == "__main__":
498
- demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
1
  import os
2
+ import gc
3
+ import re
4
+ import ast
5
+ import json
6
+ import base64
7
  import random
8
  import uuid
 
 
9
  import time
10
+ from io import BytesIO
11
  from threading import Thread
 
12
 
13
  import gradio as gr
14
  import spaces
15
  import torch
16
  import numpy as np
17
+ from PIL import Image, ImageOps
18
  import cv2
19
 
20
  from transformers import (
21
  Qwen2_5_VLForConditionalGeneration,
22
+ Qwen2VLForConditionalGeneration,
23
  AutoProcessor,
 
24
  TextIteratorStreamer,
25
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  MAX_MAX_NEW_TOKENS = 4096
28
  DEFAULT_MAX_NEW_TOKENS = 1024
29
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
+ print("Using device:", device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
35
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 
76
  torch_dtype=torch.float16
77
  ).to(device).eval()
78
 
79
+ MODEL_MAP = {
80
+ "DeepCaption-VLA-7B": (processor_n, model_n),
81
+ "SkyCaptioner-V1": (processor_m, model_m),
82
+ "SpaceThinker-3B": (processor_z, model_z),
83
+ "coreOCR-7B-050325-preview": (processor_k, model_k),
84
+ "SpaceOm-3B": (processor_y, model_y),
85
+ }
86
+ MODEL_CHOICES = list(MODEL_MAP.keys())
87
+
88
+ image_examples = [
89
+ {"query": "type out the messy hand-writing as accurately as you can.", "image": "images/1.jpg", "model": "coreOCR-7B-050325-preview"},
90
+ {"query": "count the number of birds and explain the scene in detail.", "image": "images/2.jpeg", "model": "DeepCaption-VLA-7B"},
91
+ {"query": "how far is the Goal from the penalty taker in this image?.", "image": "images/3.png", "model": "SpaceThinker-3B"},
92
+ {"query": "approximately how many meters apart are the chair and bookshelf?.", "image": "images/4.png", "model": "SkyCaptioner-V1"},
93
+ {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "image": "images/5.jpg", "model": "SpaceOm-3B"},
94
+ ]
95
+
96
+ video_examples = [
97
+ {"query": "give the highlights of the movie scene video.", "video": "videos/1.mp4", "model": "DeepCaption-VLA-7B"},
98
+ {"query": "explain the advertisement in detail.", "video": "videos/2.mp4", "model": "SkyCaptioner-V1"},
99
+ ]
100
+
101
+
102
def pil_to_data_url(img: Image.Image, fmt="PNG"):
    """Serialize a PIL image into a base64 ``data:`` URL.

    Only the PNG mime type is distinguished; every other format string is
    labelled as JPEG, matching how callers use this helper ("PNG"/"JPEG").
    """
    stream = BytesIO()
    img.save(stream, format=fmt)
    encoded = base64.b64encode(stream.getvalue()).decode()
    if fmt.upper() == "PNG":
        mime = "image/png"
    else:
        mime = "image/jpeg"
    return f"data:{mime};base64,{encoded}"
108
+
109
+
110
def file_to_data_url(path):
    """Read a local file and return it as a base64 ``data:`` URL.

    Returns an empty string when the file does not exist.  The mime type is
    guessed from the filename extension, falling back to
    ``application/octet-stream`` for anything unrecognized.
    """
    if not os.path.exists(path):
        return ""
    suffix = path.rsplit(".", 1)[-1].lower()
    known_mimes = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
        "mp4": "video/mp4",
        "mov": "video/quicktime",
        "webm": "video/webm",
        "mkv": "video/x-matroska",
    }
    mime = known_mimes.get(suffix, "application/octet-stream")
    with open(path, "rb") as handle:
        payload = base64.b64encode(handle.read()).decode()
    return f"data:{mime};base64,{payload}"
127
+
128
+
129
def make_thumb_b64(path, max_dim=240):
    """Build a small JPEG data-URL thumbnail for an image file.

    Any failure (missing file, unreadable image) is logged and reported as an
    empty string so the example gallery degrades gracefully.
    """
    try:
        thumb = Image.open(path).convert("RGB")
        thumb.thumbnail((max_dim, max_dim))
        return pil_to_data_url(thumb, "JPEG")
    except Exception as exc:
        print("Thumbnail error:", exc)
        return ""
137
+
138
+
139
def make_video_thumb_b64(path, max_dim=240):
    """Grab the middle frame of a video and return it as a JPEG data-URL.

    Returns an empty string when the frame cannot be read or any OpenCV/PIL
    step fails; errors are printed rather than raised so the gallery still
    renders.
    """
    try:
        capture = cv2.VideoCapture(path)
        frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Seek to the midpoint frame (clamped to 0 for empty/unknown counts).
        capture.set(cv2.CAP_PROP_POS_FRAMES, max(0, frame_total // 2))
        ok, bgr_frame = capture.read()
        capture.release()
        if not ok:
            return ""
        rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
        thumb = Image.fromarray(rgb_frame).convert("RGB")
        thumb.thumbnail((max_dim, max_dim))
        return pil_to_data_url(thumb, "JPEG")
    except Exception as exc:
        print("Video thumbnail error:", exc)
        return ""
156
+
157
+
158
def build_example_cards_html():
    """Render every image and video example as a clickable HTML card.

    Each card carries ``data-kind``/``data-idx`` attributes that the
    front-end JavaScript uses to load the matching example.
    """
    parts = []

    for idx, entry in enumerate(image_examples):
        preview = make_thumb_b64(entry["image"])
        snippet = entry["query"][:72] + ("..." if len(entry["query"]) > 72 else "")
        thumb_html = (
            "<img src='" + preview + "' alt=''>"
            if preview
            else "<div class='example-thumb-placeholder'>Preview</div>"
        )
        parts.append(f"""
            <div class="example-card" data-kind="image" data-idx="{idx}">
              <div class="example-thumb-wrap">
                {thumb_html}
              </div>
              <div class="example-meta-row">
                <span class="example-badge">{entry["model"]}</span>
                <span class="example-badge kind">IMAGE</span>
              </div>
              <div class="example-prompt-text">{snippet}</div>
            </div>
            """)

    for idx, entry in enumerate(video_examples):
        preview = make_video_thumb_b64(entry["video"])
        snippet = entry["query"][:72] + ("..." if len(entry["query"]) > 72 else "")
        thumb_html = (
            "<img src='" + preview + "' alt=''>"
            if preview
            else "<div class='example-thumb-placeholder'>Video</div>"
        )
        parts.append(f"""
            <div class="example-card" data-kind="video" data-idx="{idx}">
              <div class="example-thumb-wrap">
                {thumb_html}
              </div>
              <div class="example-meta-row">
                <span class="example-badge">{entry["model"]}</span>
                <span class="example-badge kind video">VIDEO</span>
              </div>
              <div class="example-prompt-text">{snippet}</div>
            </div>
            """)

    return "".join(parts)
191
+
192
+
193
+ EXAMPLE_CARDS_HTML = build_example_cards_html()
194
+
195
+
196
def load_example_data(kind, idx_str):
    """Resolve an example-card click into a JSON payload for the front-end.

    Returns a JSON string: ``{"status": "ok", ...}`` with the query, model,
    inlined file data and basename on success, or ``{"status": "error",
    "message": ...}`` when the kind/index is invalid or the media file
    cannot be loaded.
    """
    def _error(message):
        return json.dumps({"status": "error", "message": message})

    try:
        idx = int(float(idx_str))
    except Exception:
        return _error("Invalid example index")

    if kind == "image":
        # Short-circuit keeps the range check cheap for negative indices.
        if idx < 0 or idx >= len(image_examples):
            return _error("Example index out of range")
        entry = image_examples[idx]
        encoded = file_to_data_url(entry["image"])
        if not encoded:
            return _error("Could not load example image")
        return json.dumps({
            "status": "ok",
            "kind": "image",
            "query": entry["query"],
            "file": encoded,
            "model": entry["model"],
            "name": os.path.basename(entry["image"]),
        })

    if kind == "video":
        if idx < 0 or idx >= len(video_examples):
            return _error("Example index out of range")
        entry = video_examples[idx]
        encoded = file_to_data_url(entry["video"])
        if not encoded:
            return _error("Could not load example video")
        return json.dumps({
            "status": "ok",
            "kind": "video",
            "query": entry["query"],
            "file": encoded,
            "model": entry["model"],
            "name": os.path.basename(entry["video"]),
        })

    return _error("Invalid example kind")
235
+
236
+
237
  def downsample_video(video_path):
 
 
 
 
238
  vidcap = cv2.VideoCapture(video_path)
239
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
240
  fps = vidcap.get(cv2.CAP_PROP_FPS)
241
  frames = []
242
+ if total_frames <= 0 or fps <= 0:
243
+ vidcap.release()
244
+ return frames
245
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
246
  for i in frame_indices:
247
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
248
  success, image = vidcap.read()
249
  if success:
250
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
254
  vidcap.release()
255
  return frames
256
 
257
+
258
def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
    """Return the GPU timeout in seconds for image inference.

    Mirrors the generate_image signature because @spaces.GPU(duration=...)
    passes the call's arguments through; only gpu_timeout matters here.
    Falls back to 60 seconds when the value is not coercible to int.
    """
    try:
        seconds = int(gpu_timeout)
    except Exception:
        seconds = 60
    return seconds
263
 
264
+
265
+ def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
 
 
266
  try:
267
  return int(gpu_timeout)
268
+ except Exception:
269
  return 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+
272
+ @spaces.GPU(duration=calc_timeout_image)
273
+ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
274
+ if not model_name or model_name not in MODEL_MAP:
275
+ raise gr.Error("Please select a valid model.")
276
  if image is None:
277
+ raise gr.Error("Please upload an image.")
278
+ if not text or not str(text).strip():
279
+ raise gr.Error("Please enter your vision/query instruction.")
280
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
281
+ raise gr.Error("Query is too long. Please shorten your input.")
282
+
283
+ processor, model = MODEL_MAP[model_name]
284
 
285
  messages = [{
286
  "role": "user",
 
289
  {"type": "text", "text": text},
290
  ]
291
  }]
292
+
293
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
294
  inputs = processor(
295
  text=[prompt_full],
 
299
  truncation=True,
300
  max_length=MAX_INPUT_TOKEN_LENGTH
301
  ).to(device)
302
+
303
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
304
+ generation_kwargs = {
305
+ **inputs,
306
+ "streamer": streamer,
307
+ "max_new_tokens": int(max_new_tokens),
308
+ "temperature": float(temperature),
309
+ "top_p": float(top_p),
310
+ "top_k": int(top_k),
311
+ "repetition_penalty": float(repetition_penalty),
312
+ "do_sample": True,
313
+ }
314
+
315
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
316
  thread.start()
317
+
318
  buffer = ""
319
  for new_text in streamer:
320
+ buffer += new_text.replace("<|im_end|>", "")
 
321
  time.sleep(0.01)
322
+ yield buffer
323
+
324
+ gc.collect()
325
+ if torch.cuda.is_available():
326
+ torch.cuda.empty_cache()
327
+
328
 
329
  @spaces.GPU(duration=calc_timeout_video)
330
+ def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
331
+ if not model_name or model_name not in MODEL_MAP:
332
+ raise gr.Error("Please select a valid model.")
333
+ if not video_path:
334
+ raise gr.Error("Please upload a video.")
335
+ if not text or not str(text).strip():
336
+ raise gr.Error("Please enter your vision/query instruction.")
337
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
338
+ raise gr.Error("Query is too long. Please shorten your input.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
+ processor, model = MODEL_MAP[model_name]
341
  frames = downsample_video(video_path)
342
+ if not frames:
343
+ raise gr.Error("Failed to read video frames.")
344
+
345
  messages = [
346
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
347
  {"role": "user", "content": [{"type": "text", "text": text}]}
 
350
  image, timestamp = frame
351
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
352
  messages[1]["content"].append({"type": "image", "image": image})
353
+
354
  inputs = processor.apply_chat_template(
355
  messages,
356
  tokenize=True,
 
360
  truncation=True,
361
  max_length=MAX_INPUT_TOKEN_LENGTH
362
  ).to(device)
363
+
364
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
365
  generation_kwargs = {
366
  **inputs,
367
  "streamer": streamer,
368
+ "max_new_tokens": int(max_new_tokens),
369
  "do_sample": True,
370
+ "temperature": float(temperature),
371
+ "top_p": float(top_p),
372
+ "top_k": int(top_k),
373
+ "repetition_penalty": float(repetition_penalty),
374
  }
375
+
376
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
377
  thread.start()
378
+
379
  buffer = ""
380
  for new_text in streamer:
381
+ buffer += new_text.replace("<|im_end|>", "")
 
382
  time.sleep(0.01)
383
+ yield buffer
384
 
385
+ gc.collect()
386
+ if torch.cuda.is_available():
387
+ torch.cuda.empty_cache()
 
 
 
 
388
 
389
+
390
def noop():
    """No-op callback used where the UI wiring needs a do-nothing handler."""
    return None
392
+
393
+
394
# Raw CSS payload injected into the Gradio page (dark "VisionScope" theme).
# NOTE: this is a runtime string — its exact bytes are served to the browser,
# so no edits beyond this surrounding Python comment are safe here.
css = r"""
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
*{box-sizing:border-box;margin:0;padding:0}
html,body{height:100%;overflow-x:hidden}
body,.gradio-container{
background:#0b1020!important;
font-family:'Inter',system-ui,-apple-system,sans-serif!important;
font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
}
.dark body,.dark .gradio-container{background:#0b1020!important;color:#e4e4e7!important}
footer{display:none!important}
.hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}

#gradio-run-btn,#example-load-btn{
position:absolute!important;left:-9999px!important;top:-9999px!important;
width:1px!important;height:1px!important;opacity:0.01!important;
pointer-events:none!important;overflow:hidden!important;
}

.app-shell{
background:#11182d;border:1px solid #1e2b52;border-radius:16px;
margin:12px auto;max-width:1400px;overflow:hidden;
box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
}
.app-header{
background:linear-gradient(135deg,#11182d,#152042);border-bottom:1px solid #1e2b52;
padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
}
.app-header-left{display:flex;align-items:center;gap:12px}
.app-logo{
width:38px;height:38px;background:linear-gradient(135deg,#0000FF,#2e5bff,#6d8dff);
border-radius:10px;display:flex;align-items:center;justify-content:center;
box-shadow:0 4px 12px rgba(0,0,255,.35);
}
.app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
.app-title{
font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#b8c5ff);
-webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
}
.app-badge{
font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
background:rgba(0,0,255,.12);color:#8ea2ff;border:1px solid rgba(0,0,255,.25);letter-spacing:.3px;
}
.app-badge.fast{background:rgba(46,91,255,.10);color:#93a7ff;border:1px solid rgba(46,91,255,.22)}

.mode-tabs-bar,.model-tabs-bar{
background:#11182d;border-bottom:1px solid #1e2b52;padding:10px 16px;
display:flex;gap:8px;align-items:center;flex-wrap:wrap;
}
.model-tab,.mode-tab{
display:inline-flex;align-items:center;justify-content:center;gap:6px;
min-width:32px;height:34px;background:transparent;border:1px solid #243669;
border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
color:#ffffff!important;transition:all .15s ease;
}
.model-tab:hover,.mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
.model-tab.active,.mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
.model-tab-label,.mode-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}

.app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
.app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #1e2b52}
.app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#11182d}

#media-drop-zone{
position:relative;background:#08101d;height:440px;min-height:440px;max-height:440px;
overflow:hidden;
}
#media-drop-zone.drag-over{outline:2px solid #0000FF;outline-offset:-2px;background:rgba(0,0,255,.04)}
.upload-prompt-modern{
position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
padding:20px;z-index:20;overflow:hidden;
}
.upload-click-area{
display:flex;flex-direction:column;align-items:center;justify-content:center;
cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
border:2px dashed #32446d;border-radius:16px;
background:rgba(0,0,255,.03);transition:all .2s ease;gap:8px;text-align:center;
overflow:hidden;
}
.upload-click-area:hover{background:rgba(0,0,255,.08);border-color:#0000FF;transform:scale(1.02)}
.upload-click-area:active{background:rgba(0,0,255,.12);transform:scale(.99)}
.upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0}
.upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}
.upload-sub-text{color:#71717a;font-size:12px}

.single-preview-wrap{
width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
overflow:hidden;
}
.single-preview-card{
width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
overflow:hidden;border:1px solid #1e2b52;background:#0d1425;
display:flex;align-items:center;justify-content:center;position:relative;
}
.single-preview-card img,.single-preview-card video{
width:100%;height:100%;max-width:100%;max-height:100%;
object-fit:contain;display:block;background:#000;
}
.preview-overlay-actions{
position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
}
.preview-action-btn{
display:inline-flex;align-items:center;justify-content:center;
min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
}
.preview-action-btn:hover{background:#0000FF;border-color:#0000FF}

.hint-bar{
background:rgba(0,0,255,.06);border-top:1px solid #1e2b52;border-bottom:1px solid #1e2b52;
padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
}
.hint-bar b{color:#8ea2ff;font-weight:600}
.hint-bar kbd{
display:inline-block;padding:1px 6px;background:#1b2646;border:1px solid #2d3b6d;
border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
}

.examples-section{border-top:1px solid #1e2b52;padding:12px 16px}
.examples-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
letter-spacing:.8px;margin-bottom:10px;
}
.examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
.examples-scroll::-webkit-scrollbar{height:6px}
.examples-scroll::-webkit-scrollbar-track{background:#08101d;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb{background:#243669;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb:hover{background:#38529a}
.example-card{
flex-shrink:0;width:220px;background:#08101d;border:1px solid #1e2b52;
border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
}
.example-card:hover{border-color:#0000FF;transform:translateY(-2px);box-shadow:0 4px 12px rgba(0,0,255,.15)}
.example-card.loading{opacity:.5;pointer-events:none}
.example-thumb-wrap{height:120px;overflow:hidden;background:#11182d}
.example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
.example-thumb-placeholder{
width:100%;height:100%;display:flex;align-items:center;justify-content:center;
background:#11182d;color:#3f4e78;font-size:11px;
}
.example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px;flex-wrap:wrap}
.example-badge{
display:inline-flex;padding:2px 7px;background:rgba(0,0,255,.12);border-radius:4px;
font-size:10px;font-weight:600;color:#93a7ff;font-family:'JetBrains Mono',monospace;white-space:nowrap;
}
.example-badge.kind{background:rgba(100,130,255,.12);color:#bfd0ff}
.example-badge.kind.video{background:rgba(0,90,255,.12);color:#a7c4ff}
.example-prompt-text{
padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
}

.panel-card{border-bottom:1px solid #1e2b52}
.panel-card-title{
padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
}
.panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
.modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
.modern-textarea{
width:100%;background:#08101d;border:1px solid #1e2b52;border-radius:8px;
padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
resize:none;outline:none;min-height:100px;transition:border-color .2s;
}
.modern-textarea:focus{border-color:#0000FF;box-shadow:0 0 0 3px rgba(0,0,255,.15)}
.modern-textarea::placeholder{color:#4e5d89}
.modern-textarea.error-flash{
border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
}
@keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}}

.toast-notification{
position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
box-shadow:0 8px 24px rgba(0,0,0,.5);
transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
}
.toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
.toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.warning{background:linear-gradient(135deg,#1d4ed8,#1e40af);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.info{background:linear-gradient(135deg,#0000FF,#1d4ed8);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification .toast-icon{font-size:16px;line-height:1}
.toast-notification .toast-text{line-height:1.3}

.btn-run{
display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
background:linear-gradient(135deg,#0000FF,#1d4ed8);border:none;border-radius:10px;
padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
transition:all .2s ease;letter-spacing:-.2px;
box-shadow:0 4px 16px rgba(0,0,255,.3),inset 0 1px 0 rgba(255,255,255,.1);
}
.btn-run:hover{
background:linear-gradient(135deg,#315cff,#0000FF);transform:translateY(-1px);
box-shadow:0 6px 24px rgba(0,0,255,.45),inset 0 1px 0 rgba(255,255,255,.15);
}
.btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(0,0,255,.3)}
#custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
}

.output-frame{border-bottom:1px solid #1e2b52;display:flex;flex-direction:column;position:relative}
.output-frame .out-title,
.output-frame .out-title *,
#output-title-label{
color:#ffffff!important;
-webkit-text-fill-color:#ffffff!important;
}
.output-frame .out-title{
padding:10px 20px;font-size:13px;font-weight:700;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
}
.out-title-right{display:flex;gap:8px;align-items:center}
.out-action-btn{
display:inline-flex;align-items:center;justify-content:center;background:rgba(0,0,255,.1);
border:1px solid rgba(0,0,255,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
font-size:11px;font-weight:500;color:#93a7ff!important;gap:4px;height:24px;transition:all .15s;
}
.out-action-btn:hover{background:rgba(0,0,255,.2);border-color:rgba(0,0,255,.35);color:#ffffff!important}
.out-action-btn svg{width:12px;height:12px;fill:#93a7ff}
.output-frame .out-body{
flex:1;background:#08101d;display:flex;align-items:stretch;justify-content:stretch;
overflow:hidden;min-height:320px;position:relative;
}
.output-scroll-wrap{
width:100%;height:100%;padding:0;overflow:hidden;
}
.output-textarea{
width:100%;height:320px;min-height:320px;max-height:320px;background:#08101d;color:#e4e4e7;
border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
}
.output-textarea::placeholder{color:#5f6d96}
.output-textarea.error-flash{
box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
}
.modern-loader{
display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(8,16,29,.92);
z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
}
.modern-loader.active{display:flex}
.modern-loader .loader-spinner{
width:36px;height:36px;border:3px solid #243669;border-top-color:#0000FF;
border-radius:50%;animation:spin .8s linear infinite;
}
@keyframes spin{to{transform:rotate(360deg)}}
.modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
.loader-bar-track{width:200px;height:4px;background:#243669;border-radius:2px;overflow:hidden}
.loader-bar-fill{
height:100%;background:linear-gradient(90deg,#0000FF,#4b74ff,#0000FF);
background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
}
@keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}

.settings-group{border:1px solid #1e2b52;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
.settings-group-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
padding:10px 16px;border-bottom:1px solid #1e2b52;background:rgba(17,24,45,.5);
}
.settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
.slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
.slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
.slider-row input[type="range"]{
flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#243669;
border-radius:3px;outline:none;min-width:0;
}
.slider-row input[type="range"]::-webkit-slider-thumb{
-webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(0,0,255,.4);transition:transform .15s;
}
.slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
.slider-row input[type="range"]::-moz-range-thumb{
width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(0,0,255,.4);
}
.slider-row .slider-val{
min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
font-weight:500;padding:3px 8px;background:#08101d;border:1px solid #1e2b52;
border-radius:6px;color:#a1a1aa;flex-shrink:0;
}

.app-statusbar{
background:#11182d;border-top:1px solid #1e2b52;padding:6px 20px;
display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
}
.app-statusbar .sb-section{
padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
font-size:12px;color:#6b7cae;overflow:hidden;white-space:nowrap;
}
.app-statusbar .sb-section.sb-fixed{
flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
padding:3px 12px;background:rgba(0,0,255,.08);border-radius:6px;color:#93a7ff;font-weight:500;
}

.exp-note{padding:10px 20px;font-size:12px;color:#6b7cae;border-top:1px solid #1e2b52;text-align:center}
.exp-note a{color:#93a7ff;text-decoration:none}
.exp-note a:hover{text-decoration:underline}

::-webkit-scrollbar{width:8px;height:8px}
::-webkit-scrollbar-track{background:#08101d}
::-webkit-scrollbar-thumb{background:#243669;border-radius:4px}
::-webkit-scrollbar-thumb:hover{background:#38529a}

@media(max-width:980px){
.app-main-row{flex-direction:column}
.app-main-right{width:100%}
.app-main-left{border-right:none;border-bottom:1px solid #1e2b52}
}
"""
706
+
707
# Client-side JS injected into the Gradio page: wires the custom drop-zone,
# mode/model tabs, sliders, toasts, example cards, and run button to hidden
# Gradio components by dispatching synthetic input/change events.
# NOTE: this is a runtime string — its exact bytes execute in the browser,
# so no edits beyond these surrounding Python comments are safe here.
gallery_js = r"""
() => {
function init() {
if (window.__visionScopeInitDone) return;

const dropZone = document.getElementById('media-drop-zone');
const uploadPrompt = document.getElementById('upload-prompt');
const uploadClick = document.getElementById('upload-click-area');
const fileInput = document.getElementById('custom-file-input');
const previewWrap = document.getElementById('single-preview-wrap');
const previewImg = document.getElementById('single-preview-img');
const previewVideo = document.getElementById('single-preview-video');
const btnUpload = document.getElementById('preview-upload-btn');
const btnClear = document.getElementById('preview-clear-btn');
const promptInput = document.getElementById('custom-query-input');
const runBtnEl = document.getElementById('custom-run-btn');
const outputArea = document.getElementById('custom-output-textarea');
const mediaStatus = document.getElementById('sb-media-status');
const exampleResultContainer = document.getElementById('example-result-data');

if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
setTimeout(init, 250);
return;
}

window.__visionScopeInitDone = true;
let fileState = null;
let toastTimer = null;

function showToast(message, type) {
let toast = document.getElementById('app-toast');
if (!toast) {
toast = document.createElement('div');
toast.id = 'app-toast';
toast.className = 'toast-notification';
toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
document.body.appendChild(toast);
}
const icon = toast.querySelector('.toast-icon');
const text = toast.querySelector('.toast-text');
toast.className = 'toast-notification ' + (type || 'error');
if (type === 'warning') icon.textContent = '\u26A0';
else if (type === 'info') icon.textContent = '\u2139';
else icon.textContent = '\u2717';
text.textContent = message;
if (toastTimer) clearTimeout(toastTimer);
void toast.offsetWidth;
toast.classList.add('visible');
toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
}
window.__showToast = showToast;

function showLoader() {
const l = document.getElementById('output-loader');
if (l) l.classList.add('active');
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Processing...';
}
function hideLoader() {
const l = document.getElementById('output-loader');
if (l) l.classList.remove('active');
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Done';
}
window.__showLoader = showLoader;
window.__hideLoader = hideLoader;

function flashPromptError() {
promptInput.classList.add('error-flash');
promptInput.focus();
setTimeout(() => promptInput.classList.remove('error-flash'), 800);
}

function flashOutputError() {
if (!outputArea) return;
outputArea.classList.add('error-flash');
setTimeout(() => outputArea.classList.remove('error-flash'), 800);
}

function setGradioValue(containerId, value) {
const container = document.getElementById(containerId);
if (!container) return;
container.querySelectorAll('input, textarea').forEach(el => {
if (el.type === 'file' || el.type === 'range' || el.type === 'checkbox') return;
const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
const ns = Object.getOwnPropertyDescriptor(proto, 'value');
if (ns && ns.set) {
ns.set.call(el, value);
el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
}
});
}

function syncFileToGradio() {
setGradioValue('hidden-file-b64', fileState ? fileState.b64 : '');
setGradioValue('hidden-input-kind', fileState ? fileState.kind : getActiveMode());
const txt = fileState ? ('1 ' + fileState.kind + ' uploaded') : ('No ' + getActiveMode() + ' uploaded');
if (mediaStatus) mediaStatus.textContent = txt;
}

function syncPromptToGradio() {
setGradioValue('prompt-gradio-input', promptInput.value);
}

function syncModelToGradio(name) {
setGradioValue('hidden-model-name', name);
}

function syncModeToGradio(mode) {
setGradioValue('hidden-mode-name', mode);
setGradioValue('hidden-input-kind', fileState ? fileState.kind : mode);
const sub = document.getElementById('upload-sub-text');
const main = document.getElementById('upload-main-text');
if (mode === 'video') {
if (main) main.textContent = 'Click or drag a video here';
if (sub) sub.textContent = 'Upload one short video clip for multimodal video understanding';
} else {
if (main) main.textContent = 'Click or drag an image here';
if (sub) sub.textContent = 'Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks';
}
if (!fileState && mediaStatus) mediaStatus.textContent = 'No ' + mode + ' uploaded';
}

function getActiveMode() {
const active = document.querySelector('.mode-tab.active');
return active ? active.getAttribute('data-mode') : 'image';
}

function activateModeTab(name) {
document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
btn.classList.toggle('active', btn.getAttribute('data-mode') === name);
});
syncModeToGradio(name);
if (fileState && fileState.kind !== name) {
clearPreview();
}
}
window.__activateModeTab = activateModeTab;

function activateModelTab(name) {
document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
btn.classList.toggle('active', btn.getAttribute('data-model') === name);
});
syncModelToGradio(name);
}
window.__activateModelTab = activateModelTab;

function setPreview(kind, b64, name) {
fileState = {kind, b64, name: name || kind};
if (kind === 'video') {
previewVideo.src = b64;
previewVideo.style.display = 'block';
previewImg.style.display = 'none';
previewVideo.load();
} else {
previewImg.src = b64;
previewImg.style.display = 'block';
previewVideo.style.display = 'none';
previewVideo.removeAttribute('src');
}
previewWrap.style.display = 'flex';
if (uploadPrompt) uploadPrompt.style.display = 'none';
activateModeTab(kind);
syncFileToGradio();
}
window.__setPreview = setPreview;

function clearPreview() {
fileState = null;
previewImg.src = '';
previewVideo.pause();
previewVideo.removeAttribute('src');
previewVideo.load();
previewWrap.style.display = 'none';
if (uploadPrompt) uploadPrompt.style.display = 'flex';
syncFileToGradio();
}
window.__clearPreview = clearPreview;

function processFile(file) {
if (!file) return;
const mode = getActiveMode();

if (mode === 'image' && !file.type.startsWith('image/')) {
showToast('Only image files are supported in Image mode', 'error');
return;
}
if (mode === 'video' && !file.type.startsWith('video/')) {
showToast('Only video files are supported in Video mode', 'error');
return;
}

const reader = new FileReader();
reader.onload = (e) => setPreview(mode, e.target.result, file.name);
reader.readAsDataURL(file);
}

fileInput.addEventListener('change', (e) => {
const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
if (file) processFile(file);
e.target.value = '';
});

if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
if (btnClear) btnClear.addEventListener('click', clearPreview);

dropZone.addEventListener('dragover', (e) => {
e.preventDefault();
dropZone.classList.add('drag-over');
});
dropZone.addEventListener('dragleave', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
});
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
});

promptInput.addEventListener('input', syncPromptToGradio);

document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
btn.addEventListener('click', () => {
const model = btn.getAttribute('data-model');
activateModelTab(model);
});
});

document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
btn.addEventListener('click', () => {
const mode = btn.getAttribute('data-mode');
activateModeTab(mode);
});
});

activateModelTab('DeepCaption-VLA-7B');
activateModeTab('image');

function syncSlider(customId, gradioId) {
const slider = document.getElementById(customId);
const valSpan = document.getElementById(customId + '-val');
if (!slider) return;
slider.addEventListener('input', () => {
if (valSpan) valSpan.textContent = slider.value;
const container = document.getElementById(gradioId);
if (!container) return;
container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
if (ns && ns.set) {
ns.set.call(el, slider.value);
el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
}
});
});
}

syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
syncSlider('custom-temperature', 'gradio-temperature');
syncSlider('custom-top-p', 'gradio-top-p');
syncSlider('custom-top-k', 'gradio-top-k');
syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
syncSlider('custom-gpu-duration', 'gradio-gpu-duration');

function validateBeforeRun() {
const promptVal = promptInput.value.trim();
const currentMode = getActiveMode();

if (!fileState && !promptVal) {
showToast('Please upload a file and enter your instruction', 'error');
flashPromptError();
return false;
}
if (!fileState) {
showToast('Please upload a ' + currentMode, 'error');
return false;
}
if (!promptVal) {
showToast('Please enter your vision/query instruction', 'warning');
flashPromptError();
return false;
}
if (fileState.kind !== currentMode) {
showToast('Uploaded file type does not match active mode', 'error');
return false;
}
const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
if (!currentModel) {
showToast('Please select a model', 'error');
return false;
}
return true;
}

window.__clickGradioRunBtn = function() {
if (!validateBeforeRun()) return;
syncPromptToGradio();
syncFileToGradio();
const activeModel = document.querySelector('.model-tab.active');
if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
syncModeToGradio(getActiveMode());
if (outputArea) outputArea.value = '';
showLoader();
setTimeout(() => {
const gradioBtn = document.getElementById('gradio-run-btn');
if (!gradioBtn) return;
const btn = gradioBtn.querySelector('button');
if (btn) btn.click(); else gradioBtn.click();
}, 180);
};

if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());

const copyBtn = document.getElementById('copy-output-btn');
if (copyBtn) {
copyBtn.addEventListener('click', async () => {
try {
const text = outputArea ? outputArea.value : '';
if (!text.trim()) {
showToast('No output to copy', 'warning');
return;
}
await navigator.clipboard.writeText(text);
showToast('Output copied to clipboard', 'info');
} catch(e) {
showToast('Copy failed', 'error');
}
});
}

const saveBtn = document.getElementById('save-output-btn');
if (saveBtn) {
saveBtn.addEventListener('click', () => {
const text = outputArea ? outputArea.value : '';
if (!text.trim()) {
showToast('No output to save', 'warning');
flashOutputError();
return;
}
const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
const a = document.createElement('a');
a.href = URL.createObjectURL(blob);
a.download = 'visionscope_r2_output.txt';
document.body.appendChild(a);
a.click();
setTimeout(() => {
URL.revokeObjectURL(a.href);
document.body.removeChild(a);
}, 200);
showToast('Output saved', 'info');
});
}

document.querySelectorAll('.example-card[data-idx]').forEach(card => {
card.addEventListener('click', () => {
const idx = card.getAttribute('data-idx');
const kind = card.getAttribute('data-kind');
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
card.classList.add('loading');
showToast('Loading example...', 'info');
setGradioValue('example-result-data', '');
setGradioValue('example-kind-input', kind);
setGradioValue('example-idx-input', idx);
setTimeout(() => {
const btn = document.getElementById('example-load-btn');
if (btn) {
const b = btn.querySelector('button');
if (b) b.click(); else btn.click();
}
}, 150);
setTimeout(() => card.classList.remove('loading'), 12000);
});
});

function checkExampleResult() {
if (!exampleResultContainer) return;
const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
if (!el || !el.value) return;
if (window.__lastExampleVal === el.value) return;
try {
const data = JSON.parse(el.value);
if (data.status === 'ok') {
window.__lastExampleVal = el.value;
if (data.file && data.kind) setPreview(data.kind, data.file, data.name || 'example');
if (data.query) {
promptInput.value = data.query;
syncPromptToGradio();
}
if (data.model) activateModelTab(data.model);
if (data.kind) activateModeTab(data.kind);
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
showToast('Example loaded', 'info');
} else if (data.status === 'error') {
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
showToast(data.message || 'Failed to load example', 'error');
}
} catch(e) {}
}

const obsExample = new MutationObserver(checkExampleResult);
if (exampleResultContainer) {
obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
}
setInterval(checkExampleResult, 500);

if (outputArea) outputArea.value = '';
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Ready';
if (mediaStatus) mediaStatus.textContent = 'No image uploaded';
}
init();
}
"""
1123
+
1124
# JS snippet run on app load: mirrors the hidden Gradio result textbox
# (#gradio-result) into the custom read-only output area
# (#custom-output-textarea). It polls AND observes mutations because Gradio
# streams partial text into the hidden component; when non-empty text arrives
# it also hides the loader via window.__hideLoader (installed by gallery_js —
# presumably; confirm against the loader setup above).
wire_outputs_js = r"""
() => {
    function watchOutputs() {
        const resultContainer = document.getElementById('gradio-result');
        const outArea = document.getElementById('custom-output-textarea');
        if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }

        let lastText = '';

        function syncOutput() {
            const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
            if (!el) return;
            const val = el.value || '';
            if (val !== lastText) {
                lastText = val;
                outArea.value = val;
                outArea.scrollTop = outArea.scrollHeight;
                if (window.__hideLoader && val.trim()) window.__hideLoader();
            }
        }

        const observer = new MutationObserver(syncOutput);
        observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
        setInterval(syncOutput, 500);
    }
    watchOutputs();
}
"""
1152
+
1153
# Inline SVG for the header logo (shield with a checkmark).
APP_LOGO_SVG = """
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
  <path d="M12 2 4 6v6c0 5 3.4 9.4 8 10 4.6-.6 8-5 8-10V6l-8-4Z" fill="white"/>
  <path d="M9 11.5 11 13.5 15.5 9" stroke="#0000FF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
"""

# Placeholder illustration shown inside the upload drop-zone before any
# media is selected.
UPLOAD_PREVIEW_SVG = """
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
  <rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#0000FF" stroke-width="2" stroke-dasharray="4 3"/>
  <polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(0,0,255,0.15)" stroke="#0000FF" stroke-width="1.5"/>
  <circle cx="28" cy="30" r="6" fill="rgba(0,0,255,0.2)" stroke="#0000FF" stroke-width="1.5"/>
</svg>
"""

# Small icons for the output toolbar buttons.
COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""

# One tab button per model; "DeepCaption-VLA-7B" is pre-selected ("active")
# to match the hidden_model_name default below. MODEL_CHOICES is defined
# earlier in this file.
MODEL_TABS_HTML = "".join([
    f'<button class="model-tab{" active" if m == "DeepCaption-VLA-7B" else ""}" data-model="{m}"><span class="model-tab-label">{m}</span></button>'
    for m in MODEL_CHOICES
])

# Image/video mode switcher; image mode is the default active tab.
MODE_TABS_HTML = """
<button class="mode-tab active" data-mode="image"><span class="mode-tab-label">Image Inference</span></button>
<button class="mode-tab" data-mode="video"><span class="mode-tab-label">Video Inference</span></button>
"""
1180
 
1181
with gr.Blocks() as demo:
    # --- Hidden bridge components -----------------------------------------
    # The visible UI is the custom HTML below; these Gradio components are
    # styled away (elem_classes="hidden-input") and act as the JS <-> Python
    # bridge: the front-end JS writes into them, and run_btn.click reads them.
    hidden_file_b64 = gr.Textbox(value="", elem_id="hidden-file-b64", elem_classes="hidden-input", container=False)
    hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
    # NOTE(review): hidden_input_kind is not referenced by the wiring visible
    # in this chunk — possibly vestigial; confirm before removing.
    hidden_input_kind = gr.Textbox(value="image", elem_id="hidden-input-kind", elem_classes="hidden-input", container=False)
    prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
    hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)

    # Generation hyperparameters, mirrored from the custom sliders in the
    # Advanced Settings panel of the HTML below.
    max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
    temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.6, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
    top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
    top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
    gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)

    # Streaming model output lands here; wire_outputs_js mirrors it into the
    # visible #custom-output-textarea.
    result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)

    # Example-card loading channel: JS sets kind/idx, clicks the hidden
    # button, and reads the JSON payload back from example_result.
    example_kind = gr.Textbox(value="", elem_id="example-kind-input", elem_classes="hidden-input", container=False)
    example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
    example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
    example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")

    # --- Static page layout ------------------------------------------------
    # The whole visible app shell: header, mode/model tabs, upload zone,
    # examples strip, prompt panel, output frame, and settings sliders.
    # All interactivity is added by gallery_js / wire_outputs_js on load.
    gr.HTML(f"""
    <div class="app-shell">
      <div class="app-header">
        <div class="app-header-left">
          <div class="app-logo">{APP_LOGO_SVG}</div>
          <span class="app-title">VisionScope R2</span>
          <span class="app-badge">vision enabled</span>
          <span class="app-badge fast">Blue Suite</span>
        </div>
      </div>

      <div class="mode-tabs-bar">
        {MODE_TABS_HTML}
      </div>

      <div class="model-tabs-bar">
        {MODEL_TABS_HTML}
      </div>

      <div class="app-main-row">
        <div class="app-main-left">
          <div id="media-drop-zone">
            <div id="upload-prompt" class="upload-prompt-modern">
              <div id="upload-click-area" class="upload-click-area">
                {UPLOAD_PREVIEW_SVG}
                <span id="upload-main-text" class="upload-main-text">Click or drag an image here</span>
                <span id="upload-sub-text" class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks</span>
              </div>
            </div>

            <input id="custom-file-input" type="file" accept="image/*,video/*" style="display:none;" />

            <div id="single-preview-wrap" class="single-preview-wrap">
              <div class="single-preview-card">
                <img id="single-preview-img" src="" alt="Preview" style="display:none;">
                <video id="single-preview-video" controls playsinline style="display:none;"></video>
                <div class="preview-overlay-actions">
                  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
                  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
                </div>
              </div>
            </div>
          </div>

          <div class="hint-bar">
            <b>Upload:</b> Click or drag to add an image or video &nbsp;&middot;&nbsp;
            <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
            <b>Model:</b> Choose model tabs from the header
          </div>

          <div class="examples-section">
            <div class="examples-title">Quick Examples</div>
            <div class="examples-scroll">
              {EXAMPLE_CARDS_HTML}
            </div>
          </div>
        </div>

        <div class="app-main-right">
          <div class="panel-card">
            <div class="panel-card-title">Vision Instruction</div>
            <div class="panel-card-body">
              <label class="modern-label" for="custom-query-input">Query Input</label>
              <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., extract the text, describe the image, explain the scene, summarize the video, count objects, estimate distance..."></textarea>
            </div>
          </div>

          <div style="padding:12px 20px;">
            <button id="custom-run-btn" class="btn-run">
              <span id="run-btn-label">Run Vision</span>
            </button>
          </div>

          <div class="output-frame">
            <div class="out-title">
              <span id="output-title-label">Raw Output Stream</span>
              <div class="out-title-right">
                <button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
                <button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
              </div>
            </div>
            <div class="out-body">
              <div class="modern-loader" id="output-loader">
                <div class="loader-spinner"></div>
                <div class="loader-text">Running vision inference...</div>
                <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
              </div>
              <div class="output-scroll-wrap">
                <textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
              </div>
            </div>
          </div>

          <div class="settings-group">
            <div class="settings-group-title">Advanced Settings</div>
            <div class="settings-group-body">
              <div class="slider-row">
                <label>Max new tokens</label>
                <input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
                <span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
              </div>
              <div class="slider-row">
                <label>Temperature</label>
                <input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.6">
                <span class="slider-val" id="custom-temperature-val">0.6</span>
              </div>
              <div class="slider-row">
                <label>Top-p</label>
                <input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
                <span class="slider-val" id="custom-top-p-val">0.9</span>
              </div>
              <div class="slider-row">
                <label>Top-k</label>
                <input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
                <span class="slider-val" id="custom-top-k-val">50</span>
              </div>
              <div class="slider-row">
                <label>Repetition penalty</label>
                <input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.2">
                <span class="slider-val" id="custom-repetition-penalty-val">1.2</span>
              </div>
              <div class="slider-row">
                <label>GPU Duration (seconds)</label>
                <input type="range" id="custom-gpu-duration" min="60" max="300" step="30" value="60">
                <span class="slider-val" id="custom-gpu-duration-val">60</span>
              </div>
            </div>
          </div>
        </div>
      </div>

      <div class="exp-note">
        Experimental Vision Suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/VisionScope-R2" target="_blank">GitHub</a>
      </div>

      <div class="app-statusbar">
        <div class="sb-section" id="sb-media-status">No image uploaded</div>
        <div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
      </div>
    </div>
    """)

    # Hidden trigger clicked by the custom Run button's JS handler.
    run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1345
+
1346
def b64_to_pil(b64_str):
    """Decode a base64 string (optionally a ``data:image/...`` URL) to a PIL image.

    Returns an RGB ``PIL.Image`` on success, or ``None`` for empty input or
    any decode failure (deliberately best-effort: the caller treats a missing
    image as "no upload").
    """
    if not b64_str:
        return None
    try:
        # Strip an optional data-URL prefix; everything after the first
        # comma is the base64 payload.
        payload = b64_str.split(",", 1)[1] if b64_str.startswith("data:image") else b64_str
        raw = base64.b64decode(payload)
        return Image.open(BytesIO(raw)).convert("RGB")
    except Exception:
        return None
1358
+
1359
def b64_to_temp_video(b64_str):
    """Decode a base64 (optionally data-URL) video payload to a temp file.

    Returns the path of the written file, or ``None`` for empty input or on
    any failure (best-effort, matching b64_to_pil). The file extension is
    inferred from the data-URL MIME header, defaulting to ``.mp4``.
    """
    if not b64_str:
        return None
    # Local import keeps this fix self-contained; tempfile is stdlib.
    import tempfile
    try:
        header, data = b64_str.split(",", 1) if "," in b64_str else ("", b64_str)
        # Fix: the previous hardcoded "/tmp/visionscope_r2" silently failed
        # (returned None) on platforms without /tmp, e.g. Windows.
        out_dir = os.path.join(tempfile.gettempdir(), "visionscope_r2")
        os.makedirs(out_dir, exist_ok=True)
        ext = ".mp4"
        if "video/webm" in header:
            ext = ".webm"
        elif "video/quicktime" in header:
            ext = ".mov"
        elif "video/x-matroska" in header:
            ext = ".mkv"
        path = os.path.join(out_dir, f"{uuid.uuid4().hex}{ext}")
        # Context manager guarantees the handle is closed even if write fails.
        with open(path, "wb") as f:
            f.write(base64.b64decode(data))
        return path
    except Exception:
        return None
1378
+
1379
def run_vision(mode_name, model_name, text, file_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
    """Dispatch one inference request to the video or image pipeline.

    Streaming generator: yields whatever ``generate_video`` /
    ``generate_image`` (defined elsewhere in this file) yield, so partial
    output reaches the hidden ``result`` textbox incrementally.

    ``file_b64`` is the uploaded media as a base64 / data-URL string written
    by the front-end JS. Raises ``gr.Error`` when a video payload cannot be
    decoded; a missing image simply passes ``image=None`` through
    (presumably handled inside generate_image — confirm there).
    """
    if mode_name == "video":
        temp_video_path = b64_to_temp_video(file_b64)
        if not temp_video_path:
            raise gr.Error("Failed to decode uploaded video.")
        try:
            yield from generate_video(
                model_name=model_name,
                text=text,
                video_path=temp_video_path,
                max_new_tokens=max_new_tokens_v,
                temperature=temperature_v,
                top_p=top_p_v,
                top_k=top_k_v,
                repetition_penalty=repetition_penalty_v,
                gpu_timeout=gpu_timeout_v,
            )
        finally:
            # Best-effort cleanup of the decoded temp file once streaming
            # finishes (or the generator is closed early by the client).
            try:
                if os.path.exists(temp_video_path):
                    os.remove(temp_video_path)
            except Exception:
                pass
    else:
        # Any non-"video" mode is treated as image inference.
        image = b64_to_pil(file_b64)
        yield from generate_image(
            model_name=model_name,
            text=text,
            image=image,
            max_new_tokens=max_new_tokens_v,
            temperature=temperature_v,
            top_p=top_p_v,
            top_k=top_k_v,
            repetition_penalty=repetition_penalty_v,
            gpu_timeout=gpu_timeout_v,
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1415
 
1416
+ demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1417
+ demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1418
+
1419
+ run_btn.click(
1420
+ fn=run_vision,
1421
+ inputs=[
1422
+ hidden_mode_name,
1423
+ hidden_model_name,
1424
+ prompt,
1425
+ hidden_file_b64,
1426
+ max_new_tokens,
1427
+ temperature,
1428
+ top_p,
1429
+ top_k,
1430
+ repetition_penalty,
1431
+ gpu_duration_state,
1432
+ ],
1433
+ outputs=[result],
1434
+ js=r"""(mode, model, p, filev, mnt, t, tp, tk, rp, gd) => {
1435
+ const modelEl = document.querySelector('.model-tab.active');
1436
+ const modeEl = document.querySelector('.mode-tab.active');
1437
+ const chosenModel = modelEl ? modelEl.getAttribute('data-model') : model;
1438
+ const chosenMode = modeEl ? modeEl.getAttribute('data-mode') : mode;
1439
+ const promptEl = document.getElementById('custom-query-input');
1440
+ const promptVal = promptEl ? promptEl.value : p;
1441
+ const fileContainer = document.getElementById('hidden-file-b64');
1442
+ let fileVal = filev;
1443
+ if (fileContainer) {
1444
+ const inner = fileContainer.querySelector('textarea, input');
1445
+ if (inner) fileVal = inner.value;
1446
+ }
1447
+ return [chosenMode, chosenModel, promptVal, fileVal, mnt, t, tp, tk, rp, gd];
1448
+ }""",
1449
  )
1450
+
1451
+ example_load_btn.click(
1452
+ fn=load_example_data,
1453
+ inputs=[example_kind, example_idx],
1454
+ outputs=[example_result],
1455
+ queue=False,
1456
  )
1457
 
1458
if __name__ == "__main__":
    # Queue caps concurrent/pending requests at 50 (GPU-bound streaming).
    # ssr_mode=False because the UI is driven by custom client-side JS;
    # allowed_paths exposes the bundled example media directories so the
    # example cards can be served. mcp_server=True additionally exposes the
    # app's endpoints over MCP.
    demo.queue(max_size=50).launch(
        css=css,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
        allowed_paths=["images", "videos"],
    )