prithivMLmods commited on
Commit
3e64a53
·
verified ·
1 Parent(s): 18a6809

update app

Browse files
Files changed (1) hide show
  1. app.py +1306 -339
app.py CHANGED
@@ -1,214 +1,35 @@
1
  import os
 
 
 
 
 
2
  import random
3
  import uuid
4
- import json
5
- import requests
6
  import time
7
- import asyncio
8
  from threading import Thread
9
- from typing import Iterable
10
 
11
  import gradio as gr
12
  import spaces
13
  import torch
14
  import numpy as np
15
- from PIL import Image
16
  import cv2
17
 
18
  from transformers import (
19
  Qwen2_5_VLForConditionalGeneration,
20
- Qwen2VLForConditionalGeneration,
21
  AutoProcessor,
22
- AutoTokenizer,
23
  TextIteratorStreamer,
24
  )
25
- from gradio.themes import Soft
26
- from gradio.themes.utils import colors, fonts, sizes
27
-
28
- colors.steel_blue = colors.Color(
29
- name="steel_blue",
30
- c50="#EBF3F8",
31
- c100="#D3E5F0",
32
- c200="#A8CCE1",
33
- c300="#7DB3D2",
34
- c400="#529AC3",
35
- c500="#4682B4",
36
- c600="#3E72A0",
37
- c700="#36638C",
38
- c800="#2E5378",
39
- c900="#264364",
40
- c950="#1E3450",
41
- )
42
-
43
- class SteelBlueTheme(Soft):
44
- def __init__(
45
- self,
46
- *,
47
- primary_hue: colors.Color | str = colors.gray,
48
- secondary_hue: colors.Color | str = colors.steel_blue,
49
- neutral_hue: colors.Color | str = colors.slate,
50
- text_size: sizes.Size | str = sizes.text_lg,
51
- font: fonts.Font | str | Iterable[fonts.Font | str] = (
52
- fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
53
- ),
54
- font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
55
- fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
56
- ),
57
- ):
58
- super().__init__(
59
- primary_hue=primary_hue,
60
- secondary_hue=secondary_hue,
61
- neutral_hue=neutral_hue,
62
- text_size=text_size,
63
- font=font,
64
- font_mono=font_mono,
65
- )
66
- super().set(
67
- background_fill_primary="*primary_50",
68
- background_fill_primary_dark="*primary_900",
69
- body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
70
- body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
71
- button_primary_text_color="white",
72
- button_primary_text_color_hover="white",
73
- button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
74
- button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
75
- button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
76
- button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
77
- slider_color="*secondary_500",
78
- slider_color_dark="*secondary_600",
79
- block_title_text_weight="600",
80
- block_border_width="3px",
81
- block_shadow="*shadow_drop_lg",
82
- button_primary_shadow="*shadow_drop_lg",
83
- button_large_padding="11px",
84
- color_accent_soft="*primary_100",
85
- block_label_background_fill="*primary_200",
86
- )
87
-
88
- steel_blue_theme = SteelBlueTheme()
89
-
90
- css = """
91
- #main-title h1 {
92
- font-size: 2.3em !important;
93
- }
94
- #output-title h2 {
95
- font-size: 2.2em !important;
96
- }
97
-
98
- /* RadioAnimated Styles */
99
- .ra-wrap{ width: fit-content; }
100
- .ra-inner{
101
- position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
102
- background: var(--neutral-200); border-radius: 9999px; overflow: hidden;
103
- }
104
- .ra-input{ display: none; }
105
- .ra-label{
106
- position: relative; z-index: 2; padding: 8px 16px;
107
- font-family: inherit; font-size: 14px; font-weight: 600;
108
- color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap;
109
- }
110
- .ra-highlight{
111
- position: absolute; z-index: 1; top: 6px; left: 6px;
112
- height: calc(100% - 12px); border-radius: 9999px;
113
- background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
114
- transition: transform 0.2s, width 0.2s;
115
- }
116
- .ra-input:checked + .ra-label{ color: black; }
117
-
118
- /* Dark mode adjustments for Radio */
119
- .dark .ra-inner { background: var(--neutral-800); }
120
- .dark .ra-label { color: var(--neutral-400); }
121
- .dark .ra-highlight { background: var(--neutral-600); }
122
- .dark .ra-input:checked + .ra-label { color: white; }
123
-
124
- #gpu-duration-container {
125
- padding: 10px;
126
- border-radius: 8px;
127
- background: var(--background-fill-secondary);
128
- border: 1px solid var(--border-color-primary);
129
- margin-top: 10px;
130
- }
131
- """
132
 
133
  MAX_MAX_NEW_TOKENS = 4096
134
  DEFAULT_MAX_NEW_TOKENS = 1024
135
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
136
 
137
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
138
-
139
- class RadioAnimated(gr.HTML):
140
- def __init__(self, choices, value=None, **kwargs):
141
- if not choices or len(choices) < 2:
142
- raise ValueError("RadioAnimated requires at least 2 choices.")
143
- if value is None:
144
- value = choices[0]
145
-
146
- uid = uuid.uuid4().hex[:8]
147
- group_name = f"ra-{uid}"
148
-
149
- inputs_html = "\n".join(
150
- f"""
151
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
152
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
153
- """
154
- for i, c in enumerate(choices)
155
- )
156
-
157
- html_template = f"""
158
- <div class="ra-wrap" data-ra="{uid}">
159
- <div class="ra-inner">
160
- <div class="ra-highlight"></div>
161
- {inputs_html}
162
- </div>
163
- </div>
164
- """
165
-
166
- js_on_load = r"""
167
- (() => {
168
- const wrap = element.querySelector('.ra-wrap');
169
- const inner = element.querySelector('.ra-inner');
170
- const highlight = element.querySelector('.ra-highlight');
171
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
172
-
173
- if (!inputs.length) return;
174
-
175
- const choices = inputs.map(i => i.value);
176
-
177
- function setHighlightByIndex(idx) {
178
- const n = choices.length;
179
- const pct = 100 / n;
180
- highlight.style.width = `calc(${pct}% - 6px)`;
181
- highlight.style.transform = `translateX(${idx * 100}%)`;
182
- }
183
-
184
- function setCheckedByValue(val, shouldTrigger=false) {
185
- const idx = Math.max(0, choices.indexOf(val));
186
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
187
- setHighlightByIndex(idx);
188
-
189
- props.value = choices[idx];
190
- if (shouldTrigger) trigger('change', props.value);
191
- }
192
-
193
- setCheckedByValue(props.value ?? choices[0], false);
194
-
195
- inputs.forEach((inp) => {
196
- inp.addEventListener('change', () => {
197
- setCheckedByValue(inp.value, true);
198
- });
199
- });
200
- })();
201
- """
202
-
203
- super().__init__(
204
- value=value,
205
- html_template=html_template,
206
- js_on_load=js_on_load,
207
- **kwargs
208
- )
209
-
210
- def apply_gpu_duration(val: str):
211
- return int(val)
212
 
213
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
214
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
@@ -255,18 +76,175 @@ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
255
  torch_dtype=torch.float16
256
  ).to(device).eval()
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def downsample_video(video_path):
259
- """
260
- Downsamples the video to evenly spaced frames.
261
- Each frame is returned as a PIL image along with its timestamp.
262
- """
263
  vidcap = cv2.VideoCapture(video_path)
264
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
265
  fps = vidcap.get(cv2.CAP_PROP_FPS)
266
  frames = []
 
 
 
267
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
268
  for i in frame_indices:
269
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
270
  success, image = vidcap.read()
271
  if success:
272
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -276,53 +254,33 @@ def downsample_video(video_path):
276
  vidcap.release()
277
  return frames
278
 
279
- def calc_timeout_image(model_name: str, text: str, image: Image.Image,
280
- max_new_tokens: int, temperature: float, top_p: float,
281
- top_k: int, repetition_penalty: float, gpu_timeout: int):
282
- """Calculate GPU timeout duration for image inference."""
283
  try:
284
  return int(gpu_timeout)
285
- except:
286
  return 60
287
 
288
- def calc_timeout_video(model_name: str, text: str, video_path: str,
289
- max_new_tokens: int, temperature: float, top_p: float,
290
- top_k: int, repetition_penalty: float, gpu_timeout: int):
291
- """Calculate GPU timeout duration for video inference."""
292
  try:
293
  return int(gpu_timeout)
294
- except:
295
  return 60
296
-
297
- @spaces.GPU(duration=calc_timeout_image)
298
- def generate_image(model_name: str, text: str, image: Image.Image,
299
- max_new_tokens: int = 1024,
300
- temperature: float = 0.6,
301
- top_p: float = 0.9,
302
- top_k: int = 50,
303
- repetition_penalty: float = 1.2,
304
- gpu_timeout: int = 60):
305
- """
306
- Generates responses using the selected model for image input.
307
- Yields raw text and Markdown-formatted text.
308
- """
309
- if model_name == "SkyCaptioner-V1":
310
- processor, model = processor_m, model_m
311
- elif model_name == "DeepCaption-VLA-7B":
312
- processor, model = processor_n, model_n
313
- elif model_name == "SpaceThinker-3B":
314
- processor, model = processor_z, model_z
315
- elif model_name == "coreOCR-7B-050325-preview":
316
- processor, model = processor_k, model_k
317
- elif model_name == "SpaceOm-3B":
318
- processor, model = processor_y, model_y
319
- else:
320
- yield "Invalid model selected.", "Invalid model selected."
321
- return
322
 
 
 
 
 
 
323
  if image is None:
324
- yield "Please upload an image.", "Please upload an image."
325
- return
 
 
 
 
 
326
 
327
  messages = [{
328
  "role": "user",
@@ -331,6 +289,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
331
  {"type": "text", "text": text},
332
  ]
333
  }]
 
334
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
335
  inputs = processor(
336
  text=[prompt_full],
@@ -340,48 +299,49 @@ def generate_image(model_name: str, text: str, image: Image.Image,
340
  truncation=True,
341
  max_length=MAX_INPUT_TOKEN_LENGTH
342
  ).to(device)
 
343
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
344
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
345
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
346
  thread.start()
 
347
  buffer = ""
348
  for new_text in streamer:
349
- buffer += new_text
350
- buffer = buffer.replace("<|im_end|>", "")
351
  time.sleep(0.01)
352
- yield buffer, buffer
 
 
 
 
 
353
 
354
  @spaces.GPU(duration=calc_timeout_video)
355
- def generate_video(model_name: str, text: str, video_path: str,
356
- max_new_tokens: int = 1024,
357
- temperature: float = 0.6,
358
- top_p: float = 0.9,
359
- top_k: int = 50,
360
- repetition_penalty: float = 1.2,
361
- gpu_timeout: int = 90):
362
- """
363
- Generates responses using the selected model for video input.
364
- Yields raw text and Markdown-formatted text.
365
- """
366
- if model_name == "SkyCaptioner-V1":
367
- processor, model = processor_m, model_m
368
- elif model_name == "DeepCaption-VLA-7B":
369
- processor, model = processor_n, model_n
370
- elif model_name == "SpaceThinker-3B":
371
- processor, model = processor_z, model_z
372
- elif model_name == "coreOCR-7B-050325-preview":
373
- processor, model = processor_k, model_k
374
- elif model_name == "SpaceOm-3B":
375
- processor, model = processor_y, model_y
376
- else:
377
- yield "Invalid model selected.", "Invalid model selected."
378
- return
379
-
380
- if video_path is None:
381
- yield "Please upload a video.", "Please upload a video."
382
- return
383
 
 
384
  frames = downsample_video(video_path)
 
 
 
385
  messages = [
386
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
387
  {"role": "user", "content": [{"type": "text", "text": text}]}
@@ -390,6 +350,7 @@ def generate_video(model_name: str, text: str, video_path: str,
390
  image, timestamp = frame
391
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
392
  messages[1]["content"].append({"type": "image", "image": image})
 
393
  inputs = processor.apply_chat_template(
394
  messages,
395
  tokenize=True,
@@ -399,100 +360,1106 @@ def generate_video(model_name: str, text: str, video_path: str,
399
  truncation=True,
400
  max_length=MAX_INPUT_TOKEN_LENGTH
401
  ).to(device)
 
402
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
403
  generation_kwargs = {
404
  **inputs,
405
  "streamer": streamer,
406
- "max_new_tokens": max_new_tokens,
407
  "do_sample": True,
408
- "temperature": temperature,
409
- "top_p": top_p,
410
- "top_k": top_k,
411
- "repetition_penalty": repetition_penalty,
412
  }
 
413
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
414
  thread.start()
 
415
  buffer = ""
416
  for new_text in streamer:
417
- buffer += new_text
418
- buffer = buffer.replace("<|im_end|>", "")
419
  time.sleep(0.01)
420
- yield buffer, buffer
421
 
422
- image_examples = [
423
- ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
424
- ["count the number of birds and explain the scene in detail.", "images/2.jpeg"],
425
- ["how far is the Goal from the penalty taker in this image?.", "images/3.png"],
426
- ["approximately how many meters apart are the chair and bookshelf?.", "images/4.png"],
427
- ["how far is the man in the red hat from the pallet of boxes in feet?.", "images/5.jpg"],
428
- ]
429
 
430
- video_examples = [
431
- ["give the highlights of the movie scene video.", "videos/1.mp4"],
432
- ["explain the advertisement in detail.", "videos/2.mp4"]
433
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  with gr.Blocks() as demo:
436
- gr.Markdown("# **VisionScope R2**", elem_id="main-title")
437
- with gr.Row():
438
- with gr.Column(scale=2):
439
- with gr.Tabs():
440
- with gr.TabItem("Image Inference"):
441
- image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
442
- image_upload = gr.Image(type="pil", label="Upload Image", height=290)
443
- image_submit = gr.Button("Submit", variant="primary")
444
- gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
445
- with gr.TabItem("Video Inference"):
446
- video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
447
- video_upload = gr.Video(label="Upload Video(<= 30s)", height=290)
448
- video_submit = gr.Button("Submit", variant="primary")
449
- gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
450
- with gr.Accordion("Advanced options", open=False):
451
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
452
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
453
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
454
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
455
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
456
- with gr.Column(scale=3):
457
- gr.Markdown("## Output", elem_id="output-title")
458
- output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=11)
459
- with gr.Accordion("(Result.md)", open=False):
460
- markdown_output = gr.Markdown(label="Formatted Result")
461
- model_choice = gr.Radio(
462
- choices=["DeepCaption-VLA-7B", "SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
463
- label="Select Model",
464
- value="DeepCaption-VLA-7B"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  )
466
-
467
- with gr.Row(elem_id="gpu-duration-container"):
468
- with gr.Column():
469
- gr.Markdown("**GPU Duration (seconds)**")
470
- radioanimated_gpu_duration = RadioAnimated(
471
- choices=["60", "90", "120", "180", "240", "300"],
472
- value="60",
473
- elem_id="radioanimated_gpu_duration"
474
- )
475
- gpu_duration_state = gr.Number(value=60, visible=False)
476
-
477
- gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
478
-
479
- radioanimated_gpu_duration.change(
480
- fn=apply_gpu_duration,
481
- inputs=radioanimated_gpu_duration,
482
- outputs=[gpu_duration_state],
483
- api_visibility="private"
484
- )
485
 
486
- image_submit.click(
487
- fn=generate_image,
488
- inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
489
- outputs=[output, markdown_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  )
491
- video_submit.click(
492
- fn=generate_video,
493
- inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
494
- outputs=[output, markdown_output]
 
 
495
  )
496
 
497
  if __name__ == "__main__":
498
- demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
1
  import os
2
+ import gc
3
+ import re
4
+ import ast
5
+ import json
6
+ import base64
7
  import random
8
  import uuid
 
 
9
  import time
10
+ from io import BytesIO
11
  from threading import Thread
 
12
 
13
  import gradio as gr
14
  import spaces
15
  import torch
16
  import numpy as np
17
+ from PIL import Image, ImageOps
18
  import cv2
19
 
20
  from transformers import (
21
  Qwen2_5_VLForConditionalGeneration,
22
+ Qwen2VLForConditionalGeneration,
23
  AutoProcessor,
 
24
  TextIteratorStreamer,
25
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  MAX_MAX_NEW_TOKENS = 4096
28
  DEFAULT_MAX_NEW_TOKENS = 1024
29
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
+ print("Using device:", device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
35
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 
76
  torch_dtype=torch.float16
77
  ).to(device).eval()
78
 
79
+ MODEL_MAP = {
80
+ "DeepCaption-VLA-7B": (processor_n, model_n),
81
+ "SkyCaptioner-V1": (processor_m, model_m),
82
+ "SpaceThinker-3B": (processor_z, model_z),
83
+ "coreOCR-7B-050325-preview": (processor_k, model_k),
84
+ "SpaceOm-3B": (processor_y, model_y),
85
+ }
86
+ MODEL_CHOICES = list(MODEL_MAP.keys())
87
+
88
+ image_examples = [
89
+ {"query": "type out the messy hand-writing as accurately as you can.", "image": "images/1.jpg", "model": "coreOCR-7B-050325-preview"},
90
+ {"query": "count the number of birds and explain the scene in detail.", "image": "images/2.jpeg", "model": "DeepCaption-VLA-7B"},
91
+ {"query": "how far is the Goal from the penalty taker in this image?.", "image": "images/3.png", "model": "SpaceThinker-3B"},
92
+ {"query": "approximately how many meters apart are the chair and bookshelf?.", "image": "images/4.png", "model": "SkyCaptioner-V1"},
93
+ {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "image": "images/5.jpg", "model": "SpaceOm-3B"},
94
+ ]
95
+
96
+ video_examples = [
97
+ {"query": "give the highlights of the movie scene video.", "video": "videos/1.mp4", "model": "DeepCaption-VLA-7B"},
98
+ {"query": "explain the advertisement in detail.", "video": "videos/2.mp4", "model": "SkyCaptioner-V1"},
99
+ ]
100
+
101
+
102
def pil_to_data_url(img: Image.Image, fmt="PNG"):
    """Serialize a PIL image into a base64 ``data:`` URL.

    Only the PNG mime type is distinguished; every other format string is
    labelled as JPEG, matching how callers use this helper ("PNG"/"JPEG").
    """
    stream = BytesIO()
    img.save(stream, format=fmt)
    encoded = base64.b64encode(stream.getvalue()).decode()
    if fmt.upper() == "PNG":
        mime = "image/png"
    else:
        mime = "image/jpeg"
    return f"data:{mime};base64,{encoded}"
108
+
109
+
110
def file_to_data_url(path):
    """Read a local file and return it as a base64 ``data:`` URL.

    Returns an empty string when the file does not exist.  The mime type is
    guessed from the filename extension, falling back to
    ``application/octet-stream`` for anything unrecognized.
    """
    if not os.path.exists(path):
        return ""
    suffix = path.rsplit(".", 1)[-1].lower()
    known_mimes = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
        "mp4": "video/mp4",
        "mov": "video/quicktime",
        "webm": "video/webm",
        "mkv": "video/x-matroska",
    }
    mime = known_mimes.get(suffix, "application/octet-stream")
    with open(path, "rb") as handle:
        payload = base64.b64encode(handle.read()).decode()
    return f"data:{mime};base64,{payload}"
127
+
128
+
129
def make_thumb_b64(path, max_dim=240):
    """Build a small JPEG data-URL thumbnail for an image file.

    Any failure (missing file, unreadable image) is logged and reported as an
    empty string so the example gallery degrades gracefully.
    """
    try:
        thumb = Image.open(path).convert("RGB")
        thumb.thumbnail((max_dim, max_dim))
        return pil_to_data_url(thumb, "JPEG")
    except Exception as exc:
        print("Thumbnail error:", exc)
        return ""
137
+
138
+
139
def make_video_thumb_b64(path, max_dim=240):
    """Grab the middle frame of a video and return it as a JPEG data-URL.

    Returns an empty string when the frame cannot be read or any OpenCV/PIL
    step fails; errors are printed rather than raised so the gallery still
    renders.
    """
    try:
        capture = cv2.VideoCapture(path)
        frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Seek to the midpoint frame (clamped to 0 for empty/unknown counts).
        capture.set(cv2.CAP_PROP_POS_FRAMES, max(0, frame_total // 2))
        ok, bgr_frame = capture.read()
        capture.release()
        if not ok:
            return ""
        rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
        thumb = Image.fromarray(rgb_frame).convert("RGB")
        thumb.thumbnail((max_dim, max_dim))
        return pil_to_data_url(thumb, "JPEG")
    except Exception as exc:
        print("Video thumbnail error:", exc)
        return ""
156
+
157
+
158
def build_example_cards_html():
    """Render every image and video example as a clickable HTML card.

    Each card carries ``data-kind``/``data-idx`` attributes that the
    front-end JavaScript uses to load the matching example.
    """
    parts = []

    for idx, entry in enumerate(image_examples):
        preview = make_thumb_b64(entry["image"])
        snippet = entry["query"][:72] + ("..." if len(entry["query"]) > 72 else "")
        thumb_html = (
            "<img src='" + preview + "' alt=''>"
            if preview
            else "<div class='example-thumb-placeholder'>Preview</div>"
        )
        parts.append(f"""
            <div class="example-card" data-kind="image" data-idx="{idx}">
              <div class="example-thumb-wrap">
                {thumb_html}
              </div>
              <div class="example-meta-row">
                <span class="example-badge">{entry["model"]}</span>
                <span class="example-badge kind">IMAGE</span>
              </div>
              <div class="example-prompt-text">{snippet}</div>
            </div>
            """)

    for idx, entry in enumerate(video_examples):
        preview = make_video_thumb_b64(entry["video"])
        snippet = entry["query"][:72] + ("..." if len(entry["query"]) > 72 else "")
        thumb_html = (
            "<img src='" + preview + "' alt=''>"
            if preview
            else "<div class='example-thumb-placeholder'>Video</div>"
        )
        parts.append(f"""
            <div class="example-card" data-kind="video" data-idx="{idx}">
              <div class="example-thumb-wrap">
                {thumb_html}
              </div>
              <div class="example-meta-row">
                <span class="example-badge">{entry["model"]}</span>
                <span class="example-badge kind video">VIDEO</span>
              </div>
              <div class="example-prompt-text">{snippet}</div>
            </div>
            """)

    return "".join(parts)
191
+
192
+
193
+ EXAMPLE_CARDS_HTML = build_example_cards_html()
194
+
195
+
196
def load_example_data(kind, idx_str):
    """Resolve an example-card click into a JSON payload for the front-end.

    Returns a JSON string: ``{"status": "ok", ...}`` with the query, model,
    inlined file data and basename on success, or ``{"status": "error",
    "message": ...}`` when the kind/index is invalid or the media file
    cannot be loaded.
    """
    def _error(message):
        return json.dumps({"status": "error", "message": message})

    try:
        idx = int(float(idx_str))
    except Exception:
        return _error("Invalid example index")

    if kind == "image":
        # Short-circuit keeps the range check cheap for negative indices.
        if idx < 0 or idx >= len(image_examples):
            return _error("Example index out of range")
        entry = image_examples[idx]
        encoded = file_to_data_url(entry["image"])
        if not encoded:
            return _error("Could not load example image")
        return json.dumps({
            "status": "ok",
            "kind": "image",
            "query": entry["query"],
            "file": encoded,
            "model": entry["model"],
            "name": os.path.basename(entry["image"]),
        })

    if kind == "video":
        if idx < 0 or idx >= len(video_examples):
            return _error("Example index out of range")
        entry = video_examples[idx]
        encoded = file_to_data_url(entry["video"])
        if not encoded:
            return _error("Could not load example video")
        return json.dumps({
            "status": "ok",
            "kind": "video",
            "query": entry["query"],
            "file": encoded,
            "model": entry["model"],
            "name": os.path.basename(entry["video"]),
        })

    return _error("Invalid example kind")
235
+
236
+
237
  def downsample_video(video_path):
 
 
 
 
238
  vidcap = cv2.VideoCapture(video_path)
239
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
240
  fps = vidcap.get(cv2.CAP_PROP_FPS)
241
  frames = []
242
+ if total_frames <= 0 or fps <= 0:
243
+ vidcap.release()
244
+ return frames
245
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
246
  for i in frame_indices:
247
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
248
  success, image = vidcap.read()
249
  if success:
250
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
254
  vidcap.release()
255
  return frames
256
 
257
+
258
def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
    """Return the GPU timeout in seconds for image inference.

    Mirrors the generate_image signature because @spaces.GPU(duration=...)
    passes the call's arguments through; only gpu_timeout matters here.
    Falls back to 60 seconds when the value is not coercible to int.
    """
    try:
        seconds = int(gpu_timeout)
    except Exception:
        seconds = 60
    return seconds
263
 
264
+
265
+ def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
 
 
266
  try:
267
  return int(gpu_timeout)
268
+ except Exception:
269
  return 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+
272
+ @spaces.GPU(duration=calc_timeout_image)
273
+ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
274
+ if not model_name or model_name not in MODEL_MAP:
275
+ raise gr.Error("Please select a valid model.")
276
  if image is None:
277
+ raise gr.Error("Please upload an image.")
278
+ if not text or not str(text).strip():
279
+ raise gr.Error("Please enter your vision/query instruction.")
280
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
281
+ raise gr.Error("Query is too long. Please shorten your input.")
282
+
283
+ processor, model = MODEL_MAP[model_name]
284
 
285
  messages = [{
286
  "role": "user",
 
289
  {"type": "text", "text": text},
290
  ]
291
  }]
292
+
293
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
294
  inputs = processor(
295
  text=[prompt_full],
 
299
  truncation=True,
300
  max_length=MAX_INPUT_TOKEN_LENGTH
301
  ).to(device)
302
+
303
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
304
+ generation_kwargs = {
305
+ **inputs,
306
+ "streamer": streamer,
307
+ "max_new_tokens": int(max_new_tokens),
308
+ "temperature": float(temperature),
309
+ "top_p": float(top_p),
310
+ "top_k": int(top_k),
311
+ "repetition_penalty": float(repetition_penalty),
312
+ "do_sample": True,
313
+ }
314
+
315
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
316
  thread.start()
317
+
318
  buffer = ""
319
  for new_text in streamer:
320
+ buffer += new_text.replace("<|im_end|>", "")
 
321
  time.sleep(0.01)
322
+ yield buffer
323
+
324
+ gc.collect()
325
+ if torch.cuda.is_available():
326
+ torch.cuda.empty_cache()
327
+
328
 
329
  @spaces.GPU(duration=calc_timeout_video)
330
+ def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
331
+ if not model_name or model_name not in MODEL_MAP:
332
+ raise gr.Error("Please select a valid model.")
333
+ if not video_path:
334
+ raise gr.Error("Please upload a video.")
335
+ if not text or not str(text).strip():
336
+ raise gr.Error("Please enter your vision/query instruction.")
337
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
338
+ raise gr.Error("Query is too long. Please shorten your input.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
+ processor, model = MODEL_MAP[model_name]
341
  frames = downsample_video(video_path)
342
+ if not frames:
343
+ raise gr.Error("Failed to read video frames.")
344
+
345
  messages = [
346
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
347
  {"role": "user", "content": [{"type": "text", "text": text}]}
 
350
  image, timestamp = frame
351
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
352
  messages[1]["content"].append({"type": "image", "image": image})
353
+
354
  inputs = processor.apply_chat_template(
355
  messages,
356
  tokenize=True,
 
360
  truncation=True,
361
  max_length=MAX_INPUT_TOKEN_LENGTH
362
  ).to(device)
363
+
364
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
365
  generation_kwargs = {
366
  **inputs,
367
  "streamer": streamer,
368
+ "max_new_tokens": int(max_new_tokens),
369
  "do_sample": True,
370
+ "temperature": float(temperature),
371
+ "top_p": float(top_p),
372
+ "top_k": int(top_k),
373
+ "repetition_penalty": float(repetition_penalty),
374
  }
375
+
376
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
377
  thread.start()
378
+
379
  buffer = ""
380
  for new_text in streamer:
381
+ buffer += new_text.replace("<|im_end|>", "")
 
382
  time.sleep(0.01)
383
+ yield buffer
384
 
385
+ gc.collect()
386
+ if torch.cuda.is_available():
387
+ torch.cuda.empty_cache()
 
 
 
 
388
 
389
+
390
def noop():
    """No-op callback used where the UI wiring needs a do-nothing handler."""
    return None
392
+
393
+
394
# Raw CSS payload injected into the Gradio page (dark "VisionScope" theme).
# NOTE: this is a runtime string — its exact bytes are served to the browser,
# so no edits beyond this surrounding Python comment are safe here.
css = r"""
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
*{box-sizing:border-box;margin:0;padding:0}
html,body{height:100%;overflow-x:hidden}
body,.gradio-container{
background:#0b1020!important;
font-family:'Inter',system-ui,-apple-system,sans-serif!important;
font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
}
.dark body,.dark .gradio-container{background:#0b1020!important;color:#e4e4e7!important}
footer{display:none!important}
.hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}

#gradio-run-btn,#example-load-btn{
position:absolute!important;left:-9999px!important;top:-9999px!important;
width:1px!important;height:1px!important;opacity:0.01!important;
pointer-events:none!important;overflow:hidden!important;
}

.app-shell{
background:#11182d;border:1px solid #1e2b52;border-radius:16px;
margin:12px auto;max-width:1400px;overflow:hidden;
box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
}
.app-header{
background:linear-gradient(135deg,#11182d,#152042);border-bottom:1px solid #1e2b52;
padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
}
.app-header-left{display:flex;align-items:center;gap:12px}
.app-logo{
width:38px;height:38px;background:linear-gradient(135deg,#0000FF,#2e5bff,#6d8dff);
border-radius:10px;display:flex;align-items:center;justify-content:center;
box-shadow:0 4px 12px rgba(0,0,255,.35);
}
.app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
.app-title{
font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#b8c5ff);
-webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
}
.app-badge{
font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
background:rgba(0,0,255,.12);color:#8ea2ff;border:1px solid rgba(0,0,255,.25);letter-spacing:.3px;
}
.app-badge.fast{background:rgba(46,91,255,.10);color:#93a7ff;border:1px solid rgba(46,91,255,.22)}

.mode-tabs-bar,.model-tabs-bar{
background:#11182d;border-bottom:1px solid #1e2b52;padding:10px 16px;
display:flex;gap:8px;align-items:center;flex-wrap:wrap;
}
.model-tab,.mode-tab{
display:inline-flex;align-items:center;justify-content:center;gap:6px;
min-width:32px;height:34px;background:transparent;border:1px solid #243669;
border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
color:#ffffff!important;transition:all .15s ease;
}
.model-tab:hover,.mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
.model-tab.active,.mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
.model-tab-label,.mode-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}

.app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
.app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #1e2b52}
.app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#11182d}

#media-drop-zone{
position:relative;background:#08101d;height:440px;min-height:440px;max-height:440px;
overflow:hidden;
}
#media-drop-zone.drag-over{outline:2px solid #0000FF;outline-offset:-2px;background:rgba(0,0,255,.04)}
.upload-prompt-modern{
position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
padding:20px;z-index:20;overflow:hidden;
}
.upload-click-area{
display:flex;flex-direction:column;align-items:center;justify-content:center;
cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
border:2px dashed #32446d;border-radius:16px;
background:rgba(0,0,255,.03);transition:all .2s ease;gap:8px;text-align:center;
overflow:hidden;
}
.upload-click-area:hover{background:rgba(0,0,255,.08);border-color:#0000FF;transform:scale(1.02)}
.upload-click-area:active{background:rgba(0,0,255,.12);transform:scale(.99)}
.upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0}
.upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}
.upload-sub-text{color:#71717a;font-size:12px}

.single-preview-wrap{
width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
overflow:hidden;
}
.single-preview-card{
width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
overflow:hidden;border:1px solid #1e2b52;background:#0d1425;
display:flex;align-items:center;justify-content:center;position:relative;
}
.single-preview-card img,.single-preview-card video{
width:100%;height:100%;max-width:100%;max-height:100%;
object-fit:contain;display:block;background:#000;
}
.preview-overlay-actions{
position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
}
.preview-action-btn{
display:inline-flex;align-items:center;justify-content:center;
min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
}
.preview-action-btn:hover{background:#0000FF;border-color:#0000FF}

.hint-bar{
background:rgba(0,0,255,.06);border-top:1px solid #1e2b52;border-bottom:1px solid #1e2b52;
padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
}
.hint-bar b{color:#8ea2ff;font-weight:600}
.hint-bar kbd{
display:inline-block;padding:1px 6px;background:#1b2646;border:1px solid #2d3b6d;
border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
}

.examples-section{border-top:1px solid #1e2b52;padding:12px 16px}
.examples-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
letter-spacing:.8px;margin-bottom:10px;
}
.examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
.examples-scroll::-webkit-scrollbar{height:6px}
.examples-scroll::-webkit-scrollbar-track{background:#08101d;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb{background:#243669;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb:hover{background:#38529a}
.example-card{
flex-shrink:0;width:220px;background:#08101d;border:1px solid #1e2b52;
border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
}
.example-card:hover{border-color:#0000FF;transform:translateY(-2px);box-shadow:0 4px 12px rgba(0,0,255,.15)}
.example-card.loading{opacity:.5;pointer-events:none}
.example-thumb-wrap{height:120px;overflow:hidden;background:#11182d}
.example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
.example-thumb-placeholder{
width:100%;height:100%;display:flex;align-items:center;justify-content:center;
background:#11182d;color:#3f4e78;font-size:11px;
}
.example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px;flex-wrap:wrap}
.example-badge{
display:inline-flex;padding:2px 7px;background:rgba(0,0,255,.12);border-radius:4px;
font-size:10px;font-weight:600;color:#93a7ff;font-family:'JetBrains Mono',monospace;white-space:nowrap;
}
.example-badge.kind{background:rgba(100,130,255,.12);color:#bfd0ff}
.example-badge.kind.video{background:rgba(0,90,255,.12);color:#a7c4ff}
.example-prompt-text{
padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
}

.panel-card{border-bottom:1px solid #1e2b52}
.panel-card-title{
padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
}
.panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
.modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
.modern-textarea{
width:100%;background:#08101d;border:1px solid #1e2b52;border-radius:8px;
padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
resize:none;outline:none;min-height:100px;transition:border-color .2s;
}
.modern-textarea:focus{border-color:#0000FF;box-shadow:0 0 0 3px rgba(0,0,255,.15)}
.modern-textarea::placeholder{color:#4e5d89}
.modern-textarea.error-flash{
border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
}
@keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}}

.toast-notification{
position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
box-shadow:0 8px 24px rgba(0,0,0,.5);
transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
}
.toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
.toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.warning{background:linear-gradient(135deg,#1d4ed8,#1e40af);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.info{background:linear-gradient(135deg,#0000FF,#1d4ed8);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification .toast-icon{font-size:16px;line-height:1}
.toast-notification .toast-text{line-height:1.3}

.btn-run{
display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
background:linear-gradient(135deg,#0000FF,#1d4ed8);border:none;border-radius:10px;
padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
transition:all .2s ease;letter-spacing:-.2px;
box-shadow:0 4px 16px rgba(0,0,255,.3),inset 0 1px 0 rgba(255,255,255,.1);
}
.btn-run:hover{
background:linear-gradient(135deg,#315cff,#0000FF);transform:translateY(-1px);
box-shadow:0 6px 24px rgba(0,0,255,.45),inset 0 1px 0 rgba(255,255,255,.15);
}
.btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(0,0,255,.3)}
#custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
}

.output-frame{border-bottom:1px solid #1e2b52;display:flex;flex-direction:column;position:relative}
.output-frame .out-title,
.output-frame .out-title *,
#output-title-label{
color:#ffffff!important;
-webkit-text-fill-color:#ffffff!important;
}
.output-frame .out-title{
padding:10px 20px;font-size:13px;font-weight:700;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
}
.out-title-right{display:flex;gap:8px;align-items:center}
.out-action-btn{
display:inline-flex;align-items:center;justify-content:center;background:rgba(0,0,255,.1);
border:1px solid rgba(0,0,255,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
font-size:11px;font-weight:500;color:#93a7ff!important;gap:4px;height:24px;transition:all .15s;
}
.out-action-btn:hover{background:rgba(0,0,255,.2);border-color:rgba(0,0,255,.35);color:#ffffff!important}
.out-action-btn svg{width:12px;height:12px;fill:#93a7ff}
.output-frame .out-body{
flex:1;background:#08101d;display:flex;align-items:stretch;justify-content:stretch;
overflow:hidden;min-height:320px;position:relative;
}
.output-scroll-wrap{
width:100%;height:100%;padding:0;overflow:hidden;
}
.output-textarea{
width:100%;height:320px;min-height:320px;max-height:320px;background:#08101d;color:#e4e4e7;
border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
}
.output-textarea::placeholder{color:#5f6d96}
.output-textarea.error-flash{
box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
}
.modern-loader{
display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(8,16,29,.92);
z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
}
.modern-loader.active{display:flex}
.modern-loader .loader-spinner{
width:36px;height:36px;border:3px solid #243669;border-top-color:#0000FF;
border-radius:50%;animation:spin .8s linear infinite;
}
@keyframes spin{to{transform:rotate(360deg)}}
.modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
.loader-bar-track{width:200px;height:4px;background:#243669;border-radius:2px;overflow:hidden}
.loader-bar-fill{
height:100%;background:linear-gradient(90deg,#0000FF,#4b74ff,#0000FF);
background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
}
@keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}

.settings-group{border:1px solid #1e2b52;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
.settings-group-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
padding:10px 16px;border-bottom:1px solid #1e2b52;background:rgba(17,24,45,.5);
}
.settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
.slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
.slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
.slider-row input[type="range"]{
flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#243669;
border-radius:3px;outline:none;min-width:0;
}
.slider-row input[type="range"]::-webkit-slider-thumb{
-webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(0,0,255,.4);transition:transform .15s;
}
.slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
.slider-row input[type="range"]::-moz-range-thumb{
width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(0,0,255,.4);
}
.slider-row .slider-val{
min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
font-weight:500;padding:3px 8px;background:#08101d;border:1px solid #1e2b52;
border-radius:6px;color:#a1a1aa;flex-shrink:0;
}

.app-statusbar{
background:#11182d;border-top:1px solid #1e2b52;padding:6px 20px;
display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
}
.app-statusbar .sb-section{
padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
font-size:12px;color:#6b7cae;overflow:hidden;white-space:nowrap;
}
.app-statusbar .sb-section.sb-fixed{
flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
padding:3px 12px;background:rgba(0,0,255,.08);border-radius:6px;color:#93a7ff;font-weight:500;
}

.exp-note{padding:10px 20px;font-size:12px;color:#6b7cae;border-top:1px solid #1e2b52;text-align:center}
.exp-note a{color:#93a7ff;text-decoration:none}
.exp-note a:hover{text-decoration:underline}

::-webkit-scrollbar{width:8px;height:8px}
::-webkit-scrollbar-track{background:#08101d}
::-webkit-scrollbar-thumb{background:#243669;border-radius:4px}
::-webkit-scrollbar-thumb:hover{background:#38529a}

@media(max-width:980px){
.app-main-row{flex-direction:column}
.app-main-right{width:100%}
.app-main-left{border-right:none;border-bottom:1px solid #1e2b52}
}
"""
706
+
707
# Client-side JS injected into the Gradio page: wires the custom drop-zone,
# mode/model tabs, sliders, toasts, example cards, and run button to hidden
# Gradio components by dispatching synthetic input/change events.
# NOTE: this is a runtime string — its exact bytes execute in the browser,
# so no edits beyond these surrounding Python comments are safe here.
gallery_js = r"""
() => {
function init() {
if (window.__visionScopeInitDone) return;

const dropZone = document.getElementById('media-drop-zone');
const uploadPrompt = document.getElementById('upload-prompt');
const uploadClick = document.getElementById('upload-click-area');
const fileInput = document.getElementById('custom-file-input');
const previewWrap = document.getElementById('single-preview-wrap');
const previewImg = document.getElementById('single-preview-img');
const previewVideo = document.getElementById('single-preview-video');
const btnUpload = document.getElementById('preview-upload-btn');
const btnClear = document.getElementById('preview-clear-btn');
const promptInput = document.getElementById('custom-query-input');
const runBtnEl = document.getElementById('custom-run-btn');
const outputArea = document.getElementById('custom-output-textarea');
const mediaStatus = document.getElementById('sb-media-status');
const exampleResultContainer = document.getElementById('example-result-data');

if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
setTimeout(init, 250);
return;
}

window.__visionScopeInitDone = true;
let fileState = null;
let toastTimer = null;

function showToast(message, type) {
let toast = document.getElementById('app-toast');
if (!toast) {
toast = document.createElement('div');
toast.id = 'app-toast';
toast.className = 'toast-notification';
toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
document.body.appendChild(toast);
}
const icon = toast.querySelector('.toast-icon');
const text = toast.querySelector('.toast-text');
toast.className = 'toast-notification ' + (type || 'error');
if (type === 'warning') icon.textContent = '\u26A0';
else if (type === 'info') icon.textContent = '\u2139';
else icon.textContent = '\u2717';
text.textContent = message;
if (toastTimer) clearTimeout(toastTimer);
void toast.offsetWidth;
toast.classList.add('visible');
toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
}
window.__showToast = showToast;

function showLoader() {
const l = document.getElementById('output-loader');
if (l) l.classList.add('active');
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Processing...';
}
function hideLoader() {
const l = document.getElementById('output-loader');
if (l) l.classList.remove('active');
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Done';
}
window.__showLoader = showLoader;
window.__hideLoader = hideLoader;

function flashPromptError() {
promptInput.classList.add('error-flash');
promptInput.focus();
setTimeout(() => promptInput.classList.remove('error-flash'), 800);
}

function flashOutputError() {
if (!outputArea) return;
outputArea.classList.add('error-flash');
setTimeout(() => outputArea.classList.remove('error-flash'), 800);
}

function setGradioValue(containerId, value) {
const container = document.getElementById(containerId);
if (!container) return;
container.querySelectorAll('input, textarea').forEach(el => {
if (el.type === 'file' || el.type === 'range' || el.type === 'checkbox') return;
const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
const ns = Object.getOwnPropertyDescriptor(proto, 'value');
if (ns && ns.set) {
ns.set.call(el, value);
el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
}
});
}

function syncFileToGradio() {
setGradioValue('hidden-file-b64', fileState ? fileState.b64 : '');
setGradioValue('hidden-input-kind', fileState ? fileState.kind : getActiveMode());
const txt = fileState ? ('1 ' + fileState.kind + ' uploaded') : ('No ' + getActiveMode() + ' uploaded');
if (mediaStatus) mediaStatus.textContent = txt;
}

function syncPromptToGradio() {
setGradioValue('prompt-gradio-input', promptInput.value);
}

function syncModelToGradio(name) {
setGradioValue('hidden-model-name', name);
}

function syncModeToGradio(mode) {
setGradioValue('hidden-mode-name', mode);
setGradioValue('hidden-input-kind', fileState ? fileState.kind : mode);
const sub = document.getElementById('upload-sub-text');
const main = document.getElementById('upload-main-text');
if (mode === 'video') {
if (main) main.textContent = 'Click or drag a video here';
if (sub) sub.textContent = 'Upload one short video clip for multimodal video understanding';
} else {
if (main) main.textContent = 'Click or drag an image here';
if (sub) sub.textContent = 'Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks';
}
if (!fileState && mediaStatus) mediaStatus.textContent = 'No ' + mode + ' uploaded';
}

function getActiveMode() {
const active = document.querySelector('.mode-tab.active');
return active ? active.getAttribute('data-mode') : 'image';
}

function activateModeTab(name) {
document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
btn.classList.toggle('active', btn.getAttribute('data-mode') === name);
});
syncModeToGradio(name);
if (fileState && fileState.kind !== name) {
clearPreview();
}
}
window.__activateModeTab = activateModeTab;

function activateModelTab(name) {
document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
btn.classList.toggle('active', btn.getAttribute('data-model') === name);
});
syncModelToGradio(name);
}
window.__activateModelTab = activateModelTab;

function setPreview(kind, b64, name) {
fileState = {kind, b64, name: name || kind};
if (kind === 'video') {
previewVideo.src = b64;
previewVideo.style.display = 'block';
previewImg.style.display = 'none';
previewVideo.load();
} else {
previewImg.src = b64;
previewImg.style.display = 'block';
previewVideo.style.display = 'none';
previewVideo.removeAttribute('src');
}
previewWrap.style.display = 'flex';
if (uploadPrompt) uploadPrompt.style.display = 'none';
activateModeTab(kind);
syncFileToGradio();
}
window.__setPreview = setPreview;

function clearPreview() {
fileState = null;
previewImg.src = '';
previewVideo.pause();
previewVideo.removeAttribute('src');
previewVideo.load();
previewWrap.style.display = 'none';
if (uploadPrompt) uploadPrompt.style.display = 'flex';
syncFileToGradio();
}
window.__clearPreview = clearPreview;

function processFile(file) {
if (!file) return;
const mode = getActiveMode();

if (mode === 'image' && !file.type.startsWith('image/')) {
showToast('Only image files are supported in Image mode', 'error');
return;
}
if (mode === 'video' && !file.type.startsWith('video/')) {
showToast('Only video files are supported in Video mode', 'error');
return;
}

const reader = new FileReader();
reader.onload = (e) => setPreview(mode, e.target.result, file.name);
reader.readAsDataURL(file);
}

fileInput.addEventListener('change', (e) => {
const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
if (file) processFile(file);
e.target.value = '';
});

if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
if (btnClear) btnClear.addEventListener('click', clearPreview);

dropZone.addEventListener('dragover', (e) => {
e.preventDefault();
dropZone.classList.add('drag-over');
});
dropZone.addEventListener('dragleave', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
});
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
});

promptInput.addEventListener('input', syncPromptToGradio);

document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
btn.addEventListener('click', () => {
const model = btn.getAttribute('data-model');
activateModelTab(model);
});
});

document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
btn.addEventListener('click', () => {
const mode = btn.getAttribute('data-mode');
activateModeTab(mode);
});
});

activateModelTab('DeepCaption-VLA-7B');
activateModeTab('image');

function syncSlider(customId, gradioId) {
const slider = document.getElementById(customId);
const valSpan = document.getElementById(customId + '-val');
if (!slider) return;
slider.addEventListener('input', () => {
if (valSpan) valSpan.textContent = slider.value;
const container = document.getElementById(gradioId);
if (!container) return;
container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
if (ns && ns.set) {
ns.set.call(el, slider.value);
el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
}
});
});
}

syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
syncSlider('custom-temperature', 'gradio-temperature');
syncSlider('custom-top-p', 'gradio-top-p');
syncSlider('custom-top-k', 'gradio-top-k');
syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
syncSlider('custom-gpu-duration', 'gradio-gpu-duration');

function validateBeforeRun() {
const promptVal = promptInput.value.trim();
const currentMode = getActiveMode();

if (!fileState && !promptVal) {
showToast('Please upload a file and enter your instruction', 'error');
flashPromptError();
return false;
}
if (!fileState) {
showToast('Please upload a ' + currentMode, 'error');
return false;
}
if (!promptVal) {
showToast('Please enter your vision/query instruction', 'warning');
flashPromptError();
return false;
}
if (fileState.kind !== currentMode) {
showToast('Uploaded file type does not match active mode', 'error');
return false;
}
const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
if (!currentModel) {
showToast('Please select a model', 'error');
return false;
}
return true;
}

window.__clickGradioRunBtn = function() {
if (!validateBeforeRun()) return;
syncPromptToGradio();
syncFileToGradio();
const activeModel = document.querySelector('.model-tab.active');
if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
syncModeToGradio(getActiveMode());
if (outputArea) outputArea.value = '';
showLoader();
setTimeout(() => {
const gradioBtn = document.getElementById('gradio-run-btn');
if (!gradioBtn) return;
const btn = gradioBtn.querySelector('button');
if (btn) btn.click(); else gradioBtn.click();
}, 180);
};

if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());

const copyBtn = document.getElementById('copy-output-btn');
if (copyBtn) {
copyBtn.addEventListener('click', async () => {
try {
const text = outputArea ? outputArea.value : '';
if (!text.trim()) {
showToast('No output to copy', 'warning');
return;
}
await navigator.clipboard.writeText(text);
showToast('Output copied to clipboard', 'info');
} catch(e) {
showToast('Copy failed', 'error');
}
});
}

const saveBtn = document.getElementById('save-output-btn');
if (saveBtn) {
saveBtn.addEventListener('click', () => {
const text = outputArea ? outputArea.value : '';
if (!text.trim()) {
showToast('No output to save', 'warning');
flashOutputError();
return;
}
const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
const a = document.createElement('a');
a.href = URL.createObjectURL(blob);
a.download = 'visionscope_r2_output.txt';
document.body.appendChild(a);
a.click();
setTimeout(() => {
URL.revokeObjectURL(a.href);
document.body.removeChild(a);
}, 200);
showToast('Output saved', 'info');
});
}

document.querySelectorAll('.example-card[data-idx]').forEach(card => {
card.addEventListener('click', () => {
const idx = card.getAttribute('data-idx');
const kind = card.getAttribute('data-kind');
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
card.classList.add('loading');
showToast('Loading example...', 'info');
setGradioValue('example-result-data', '');
setGradioValue('example-kind-input', kind);
setGradioValue('example-idx-input', idx);
setTimeout(() => {
const btn = document.getElementById('example-load-btn');
if (btn) {
const b = btn.querySelector('button');
if (b) b.click(); else btn.click();
}
}, 150);
setTimeout(() => card.classList.remove('loading'), 12000);
});
});

function checkExampleResult() {
if (!exampleResultContainer) return;
const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
if (!el || !el.value) return;
if (window.__lastExampleVal === el.value) return;
try {
const data = JSON.parse(el.value);
if (data.status === 'ok') {
window.__lastExampleVal = el.value;
if (data.file && data.kind) setPreview(data.kind, data.file, data.name || 'example');
if (data.query) {
promptInput.value = data.query;
syncPromptToGradio();
}
if (data.model) activateModelTab(data.model);
if (data.kind) activateModeTab(data.kind);
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
showToast('Example loaded', 'info');
} else if (data.status === 'error') {
document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
showToast(data.message || 'Failed to load example', 'error');
}
} catch(e) {}
}

const obsExample = new MutationObserver(checkExampleResult);
if (exampleResultContainer) {
obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
}
setInterval(checkExampleResult, 500);

if (outputArea) outputArea.value = '';
const sb = document.getElementById('sb-run-state');
if (sb) sb.textContent = 'Ready';
if (mediaStatus) mediaStatus.textContent = 'No image uploaded';
}
init();
}
"""
1123
+
1124
# JS snippet run on app load: mirrors the hidden Gradio result textbox
# (#gradio-result) into the custom read-only output area
# (#custom-output-textarea). It polls AND observes mutations because Gradio
# streams partial text into the hidden component; when non-empty text arrives
# it also hides the loader via window.__hideLoader (installed by gallery_js —
# presumably; confirm against the loader setup above).
wire_outputs_js = r"""
() => {
    function watchOutputs() {
        const resultContainer = document.getElementById('gradio-result');
        const outArea = document.getElementById('custom-output-textarea');
        if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }

        let lastText = '';

        function syncOutput() {
            const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
            if (!el) return;
            const val = el.value || '';
            if (val !== lastText) {
                lastText = val;
                outArea.value = val;
                outArea.scrollTop = outArea.scrollHeight;
                if (window.__hideLoader && val.trim()) window.__hideLoader();
            }
        }

        const observer = new MutationObserver(syncOutput);
        observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
        setInterval(syncOutput, 500);
    }
    watchOutputs();
}
"""
1152
+
1153
# Inline SVG for the header logo (shield with a checkmark).
APP_LOGO_SVG = """
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
  <path d="M12 2 4 6v6c0 5 3.4 9.4 8 10 4.6-.6 8-5 8-10V6l-8-4Z" fill="white"/>
  <path d="M9 11.5 11 13.5 15.5 9" stroke="#0000FF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
"""

# Placeholder illustration shown inside the upload drop-zone before any
# media is selected.
UPLOAD_PREVIEW_SVG = """
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
  <rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#0000FF" stroke-width="2" stroke-dasharray="4 3"/>
  <polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(0,0,255,0.15)" stroke="#0000FF" stroke-width="1.5"/>
  <circle cx="28" cy="30" r="6" fill="rgba(0,0,255,0.2)" stroke="#0000FF" stroke-width="1.5"/>
</svg>
"""

# Small icons for the output toolbar buttons.
COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""

# One tab button per model; "DeepCaption-VLA-7B" is pre-selected ("active")
# to match the hidden_model_name default below. MODEL_CHOICES is defined
# earlier in this file.
MODEL_TABS_HTML = "".join([
    f'<button class="model-tab{" active" if m == "DeepCaption-VLA-7B" else ""}" data-model="{m}"><span class="model-tab-label">{m}</span></button>'
    for m in MODEL_CHOICES
])

# Image/video mode switcher; image mode is the default active tab.
MODE_TABS_HTML = """
<button class="mode-tab active" data-mode="image"><span class="mode-tab-label">Image Inference</span></button>
<button class="mode-tab" data-mode="video"><span class="mode-tab-label">Video Inference</span></button>
"""
1180
 
1181
with gr.Blocks() as demo:
    # --- Hidden bridge components -----------------------------------------
    # The visible UI is the custom HTML below; these Gradio components are
    # styled away (elem_classes="hidden-input") and act as the JS <-> Python
    # bridge: the front-end JS writes into them, and run_btn.click reads them.
    hidden_file_b64 = gr.Textbox(value="", elem_id="hidden-file-b64", elem_classes="hidden-input", container=False)
    hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
    # NOTE(review): hidden_input_kind is not referenced by the wiring visible
    # in this chunk — possibly vestigial; confirm before removing.
    hidden_input_kind = gr.Textbox(value="image", elem_id="hidden-input-kind", elem_classes="hidden-input", container=False)
    prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
    hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)

    # Generation hyperparameters, mirrored from the custom sliders in the
    # Advanced Settings panel of the HTML below.
    max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
    temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.6, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
    top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
    top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
    gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)

    # Streaming model output lands here; wire_outputs_js mirrors it into the
    # visible #custom-output-textarea.
    result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)

    # Example-card loading channel: JS sets kind/idx, clicks the hidden
    # button, and reads the JSON payload back from example_result.
    example_kind = gr.Textbox(value="", elem_id="example-kind-input", elem_classes="hidden-input", container=False)
    example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
    example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
    example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")

    # --- Static page layout ------------------------------------------------
    # The whole visible app shell: header, mode/model tabs, upload zone,
    # examples strip, prompt panel, output frame, and settings sliders.
    # All interactivity is added by gallery_js / wire_outputs_js on load.
    gr.HTML(f"""
    <div class="app-shell">
      <div class="app-header">
        <div class="app-header-left">
          <div class="app-logo">{APP_LOGO_SVG}</div>
          <span class="app-title">VisionScope R2</span>
          <span class="app-badge">vision enabled</span>
          <span class="app-badge fast">Blue Suite</span>
        </div>
      </div>

      <div class="mode-tabs-bar">
        {MODE_TABS_HTML}
      </div>

      <div class="model-tabs-bar">
        {MODEL_TABS_HTML}
      </div>

      <div class="app-main-row">
        <div class="app-main-left">
          <div id="media-drop-zone">
            <div id="upload-prompt" class="upload-prompt-modern">
              <div id="upload-click-area" class="upload-click-area">
                {UPLOAD_PREVIEW_SVG}
                <span id="upload-main-text" class="upload-main-text">Click or drag an image here</span>
                <span id="upload-sub-text" class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks</span>
              </div>
            </div>

            <input id="custom-file-input" type="file" accept="image/*,video/*" style="display:none;" />

            <div id="single-preview-wrap" class="single-preview-wrap">
              <div class="single-preview-card">
                <img id="single-preview-img" src="" alt="Preview" style="display:none;">
                <video id="single-preview-video" controls playsinline style="display:none;"></video>
                <div class="preview-overlay-actions">
                  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
                  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
                </div>
              </div>
            </div>
          </div>

          <div class="hint-bar">
            <b>Upload:</b> Click or drag to add an image or video &nbsp;&middot;&nbsp;
            <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
            <b>Model:</b> Choose model tabs from the header
          </div>

          <div class="examples-section">
            <div class="examples-title">Quick Examples</div>
            <div class="examples-scroll">
              {EXAMPLE_CARDS_HTML}
            </div>
          </div>
        </div>

        <div class="app-main-right">
          <div class="panel-card">
            <div class="panel-card-title">Vision Instruction</div>
            <div class="panel-card-body">
              <label class="modern-label" for="custom-query-input">Query Input</label>
              <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., extract the text, describe the image, explain the scene, summarize the video, count objects, estimate distance..."></textarea>
            </div>
          </div>

          <div style="padding:12px 20px;">
            <button id="custom-run-btn" class="btn-run">
              <span id="run-btn-label">Run Vision</span>
            </button>
          </div>

          <div class="output-frame">
            <div class="out-title">
              <span id="output-title-label">Raw Output Stream</span>
              <div class="out-title-right">
                <button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
                <button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
              </div>
            </div>
            <div class="out-body">
              <div class="modern-loader" id="output-loader">
                <div class="loader-spinner"></div>
                <div class="loader-text">Running vision inference...</div>
                <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
              </div>
              <div class="output-scroll-wrap">
                <textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
              </div>
            </div>
          </div>

          <div class="settings-group">
            <div class="settings-group-title">Advanced Settings</div>
            <div class="settings-group-body">
              <div class="slider-row">
                <label>Max new tokens</label>
                <input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
                <span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
              </div>
              <div class="slider-row">
                <label>Temperature</label>
                <input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.6">
                <span class="slider-val" id="custom-temperature-val">0.6</span>
              </div>
              <div class="slider-row">
                <label>Top-p</label>
                <input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
                <span class="slider-val" id="custom-top-p-val">0.9</span>
              </div>
              <div class="slider-row">
                <label>Top-k</label>
                <input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
                <span class="slider-val" id="custom-top-k-val">50</span>
              </div>
              <div class="slider-row">
                <label>Repetition penalty</label>
                <input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.2">
                <span class="slider-val" id="custom-repetition-penalty-val">1.2</span>
              </div>
              <div class="slider-row">
                <label>GPU Duration (seconds)</label>
                <input type="range" id="custom-gpu-duration" min="60" max="300" step="30" value="60">
                <span class="slider-val" id="custom-gpu-duration-val">60</span>
              </div>
            </div>
          </div>
        </div>
      </div>

      <div class="exp-note">
        Experimental Vision Suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/VisionScope-R2" target="_blank">GitHub</a>
      </div>

      <div class="app-statusbar">
        <div class="sb-section" id="sb-media-status">No image uploaded</div>
        <div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
      </div>
    </div>
    """)

    # Hidden trigger clicked by the custom Run button's JS handler.
    run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1345
+
1346
def b64_to_pil(b64_str):
    """Decode a base64 string (optionally a ``data:image/...`` URL) to a PIL image.

    Returns an RGB ``PIL.Image`` on success, or ``None`` for empty input or
    any decode failure (deliberately best-effort: the caller treats a missing
    image as "no upload").
    """
    if not b64_str:
        return None
    try:
        # Strip an optional data-URL prefix; everything after the first
        # comma is the base64 payload.
        payload = b64_str.split(",", 1)[1] if b64_str.startswith("data:image") else b64_str
        raw = base64.b64decode(payload)
        return Image.open(BytesIO(raw)).convert("RGB")
    except Exception:
        return None
1358
+
1359
def b64_to_temp_video(b64_str):
    """Decode a base64 (optionally data-URL) video payload to a temp file.

    Returns the path of the written file, or ``None`` for empty input or on
    any failure (best-effort, matching b64_to_pil). The file extension is
    inferred from the data-URL MIME header, defaulting to ``.mp4``.
    """
    if not b64_str:
        return None
    # Local import keeps this fix self-contained; tempfile is stdlib.
    import tempfile
    try:
        header, data = b64_str.split(",", 1) if "," in b64_str else ("", b64_str)
        # Fix: the previous hardcoded "/tmp/visionscope_r2" silently failed
        # (returned None) on platforms without /tmp, e.g. Windows.
        out_dir = os.path.join(tempfile.gettempdir(), "visionscope_r2")
        os.makedirs(out_dir, exist_ok=True)
        ext = ".mp4"
        if "video/webm" in header:
            ext = ".webm"
        elif "video/quicktime" in header:
            ext = ".mov"
        elif "video/x-matroska" in header:
            ext = ".mkv"
        path = os.path.join(out_dir, f"{uuid.uuid4().hex}{ext}")
        # Context manager guarantees the handle is closed even if write fails.
        with open(path, "wb") as f:
            f.write(base64.b64decode(data))
        return path
    except Exception:
        return None
1378
+
1379
def run_vision(mode_name, model_name, text, file_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
    """Dispatch one inference request to the video or image pipeline.

    Streaming generator: yields whatever ``generate_video`` /
    ``generate_image`` (defined elsewhere in this file) yield, so partial
    output reaches the hidden ``result`` textbox incrementally.

    ``file_b64`` is the uploaded media as a base64 / data-URL string written
    by the front-end JS. Raises ``gr.Error`` when a video payload cannot be
    decoded; a missing image simply passes ``image=None`` through
    (presumably handled inside generate_image — confirm there).
    """
    if mode_name == "video":
        temp_video_path = b64_to_temp_video(file_b64)
        if not temp_video_path:
            raise gr.Error("Failed to decode uploaded video.")
        try:
            yield from generate_video(
                model_name=model_name,
                text=text,
                video_path=temp_video_path,
                max_new_tokens=max_new_tokens_v,
                temperature=temperature_v,
                top_p=top_p_v,
                top_k=top_k_v,
                repetition_penalty=repetition_penalty_v,
                gpu_timeout=gpu_timeout_v,
            )
        finally:
            # Best-effort cleanup of the decoded temp file once streaming
            # finishes (or the generator is closed early by the client).
            try:
                if os.path.exists(temp_video_path):
                    os.remove(temp_video_path)
            except Exception:
                pass
    else:
        # Any non-"video" mode is treated as image inference.
        image = b64_to_pil(file_b64)
        yield from generate_image(
            model_name=model_name,
            text=text,
            image=image,
            max_new_tokens=max_new_tokens_v,
            temperature=temperature_v,
            top_p=top_p_v,
            top_k=top_k_v,
            repetition_penalty=repetition_penalty_v,
            gpu_timeout=gpu_timeout_v,
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1415
 
1416
+ demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1417
+ demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1418
+
1419
+ run_btn.click(
1420
+ fn=run_vision,
1421
+ inputs=[
1422
+ hidden_mode_name,
1423
+ hidden_model_name,
1424
+ prompt,
1425
+ hidden_file_b64,
1426
+ max_new_tokens,
1427
+ temperature,
1428
+ top_p,
1429
+ top_k,
1430
+ repetition_penalty,
1431
+ gpu_duration_state,
1432
+ ],
1433
+ outputs=[result],
1434
+ js=r"""(mode, model, p, filev, mnt, t, tp, tk, rp, gd) => {
1435
+ const modelEl = document.querySelector('.model-tab.active');
1436
+ const modeEl = document.querySelector('.mode-tab.active');
1437
+ const chosenModel = modelEl ? modelEl.getAttribute('data-model') : model;
1438
+ const chosenMode = modeEl ? modeEl.getAttribute('data-mode') : mode;
1439
+ const promptEl = document.getElementById('custom-query-input');
1440
+ const promptVal = promptEl ? promptEl.value : p;
1441
+ const fileContainer = document.getElementById('hidden-file-b64');
1442
+ let fileVal = filev;
1443
+ if (fileContainer) {
1444
+ const inner = fileContainer.querySelector('textarea, input');
1445
+ if (inner) fileVal = inner.value;
1446
+ }
1447
+ return [chosenMode, chosenModel, promptVal, fileVal, mnt, t, tp, tk, rp, gd];
1448
+ }""",
1449
  )
1450
+
1451
+ example_load_btn.click(
1452
+ fn=load_example_data,
1453
+ inputs=[example_kind, example_idx],
1454
+ outputs=[example_result],
1455
+ queue=False,
1456
  )
1457
 
1458
if __name__ == "__main__":
    # Queue caps concurrent/pending requests at 50 (GPU-bound streaming).
    # ssr_mode=False because the UI is driven by custom client-side JS;
    # allowed_paths exposes the bundled example media directories so the
    # example cards can be served. mcp_server=True additionally exposes the
    # app's endpoints over MCP.
    demo.queue(max_size=50).launch(
        css=css,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
        allowed_paths=["images", "videos"],
    )