prithivMLmods commited on
Commit
2126dd7
·
verified ·
1 Parent(s): 3d21151

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +362 -358
app.py CHANGED
@@ -1,12 +1,11 @@
1
  import os
2
  import gc
3
  import re
4
- import ast
5
  import json
6
- import base64
7
- import random
8
  import uuid
9
  import time
 
 
10
  from io import BytesIO
11
  from threading import Thread
12
 
@@ -31,6 +30,16 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
  print("Using device:", device)
33
 
 
 
 
 
 
 
 
 
 
 
34
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
35
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
36
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -83,21 +92,24 @@ MODEL_MAP = {
83
  "coreOCR-7B-050325-preview": (processor_k, model_k),
84
  "SpaceOm-3B": (processor_y, model_y),
85
  }
 
86
  MODEL_CHOICES = list(MODEL_MAP.keys())
87
 
88
  image_examples = [
89
- {"query": "type out the messy hand-writing as accurately as you can.", "image": "images/1.jpg", "model": "coreOCR-7B-050325-preview"},
90
- {"query": "count the number of birds and explain the scene in detail.", "image": "images/2.jpeg", "model": "DeepCaption-VLA-7B"},
91
- {"query": "how far is the Goal from the penalty taker in this image?.", "image": "images/3.png", "model": "SpaceThinker-3B"},
92
- {"query": "approximately how many meters apart are the chair and bookshelf?.", "image": "images/4.png", "model": "SkyCaptioner-V1"},
93
- {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "image": "images/5.jpg", "model": "SpaceOm-3B"},
94
  ]
95
 
96
  video_examples = [
97
- {"query": "give the highlights of the movie scene video.", "video": "videos/1.mp4", "model": "DeepCaption-VLA-7B"},
98
- {"query": "explain the advertisement in detail.", "video": "videos/2.mp4", "model": "SkyCaptioner-V1"},
99
  ]
100
 
 
 
101
 
102
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
103
  buf = BytesIO()
@@ -119,16 +131,24 @@ def file_to_data_url(path):
119
  "mp4": "video/mp4",
120
  "mov": "video/quicktime",
121
  "webm": "video/webm",
122
- "mkv": "video/x-matroska",
123
  }.get(ext, "application/octet-stream")
124
  with open(path, "rb") as f:
125
  data = base64.b64encode(f.read()).decode()
126
  return f"data:{mime};base64,{data}"
127
 
128
 
129
- def make_thumb_b64(path, max_dim=240):
130
  try:
131
- img = Image.open(path).convert("RGB")
 
 
 
 
 
 
 
 
 
132
  img.thumbnail((max_dim, max_dim))
133
  return pil_to_data_url(img, "JPEG")
134
  except Exception as e:
@@ -136,53 +156,20 @@ def make_thumb_b64(path, max_dim=240):
136
  return ""
137
 
138
 
139
- def make_video_thumb_b64(path, max_dim=240):
140
- try:
141
- cap = cv2.VideoCapture(path)
142
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
143
- target = max(0, total_frames // 2)
144
- cap.set(cv2.CAP_PROP_POS_FRAMES, target)
145
- success, frame = cap.read()
146
- cap.release()
147
- if not success:
148
- return ""
149
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
150
- img = Image.fromarray(frame).convert("RGB")
151
- img.thumbnail((max_dim, max_dim))
152
- return pil_to_data_url(img, "JPEG")
153
- except Exception as e:
154
- print("Video thumbnail error:", e)
155
- return ""
156
-
157
-
158
  def build_example_cards_html():
159
  cards = ""
160
- for i, ex in enumerate(image_examples):
161
- thumb = make_thumb_b64(ex["image"])
162
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
 
163
  cards += f"""
164
- <div class="example-card" data-kind="image" data-idx="{i}">
165
  <div class="example-thumb-wrap">
166
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
 
167
  </div>
168
  <div class="example-meta-row">
169
  <span class="example-badge">{ex["model"]}</span>
170
- <span class="example-badge kind">IMAGE</span>
171
- </div>
172
- <div class="example-prompt-text">{prompt_short}</div>
173
- </div>
174
- """
175
- for i, ex in enumerate(video_examples):
176
- thumb = make_video_thumb_b64(ex["video"])
177
- prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
178
- cards += f"""
179
- <div class="example-card" data-kind="video" data-idx="{i}">
180
- <div class="example-thumb-wrap">
181
- {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Video</div>"}
182
- </div>
183
- <div class="example-meta-row">
184
- <span class="example-badge">{ex["model"]}</span>
185
- <span class="example-badge kind video">VIDEO</span>
186
  </div>
187
  <div class="example-prompt-text">{prompt_short}</div>
188
  </div>
@@ -193,63 +180,84 @@ def build_example_cards_html():
193
  EXAMPLE_CARDS_HTML = build_example_cards_html()
194
 
195
 
196
- def load_example_data(kind, idx_str):
197
  try:
198
  idx = int(float(idx_str))
199
  except Exception:
200
  return json.dumps({"status": "error", "message": "Invalid example index"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- if kind == "image":
203
- if idx < 0 or idx >= len(image_examples):
204
- return json.dumps({"status": "error", "message": "Example index out of range"})
205
- ex = image_examples[idx]
206
- img_b64 = file_to_data_url(ex["image"])
207
- if not img_b64:
208
- return json.dumps({"status": "error", "message": "Could not load example image"})
209
- return json.dumps({
210
- "status": "ok",
211
- "kind": "image",
212
- "query": ex["query"],
213
- "file": img_b64,
214
- "model": ex["model"],
215
- "name": os.path.basename(ex["image"]),
216
- })
217
-
218
- if kind == "video":
219
- if idx < 0 or idx >= len(video_examples):
220
- return json.dumps({"status": "error", "message": "Example index out of range"})
221
- ex = video_examples[idx]
222
- vid_b64 = file_to_data_url(ex["video"])
223
- if not vid_b64:
224
- return json.dumps({"status": "error", "message": "Could not load example video"})
225
- return json.dumps({
226
- "status": "ok",
227
- "kind": "video",
228
- "query": ex["query"],
229
- "file": vid_b64,
230
- "model": ex["model"],
231
- "name": os.path.basename(ex["video"]),
232
- })
233
-
234
- return json.dumps({"status": "error", "message": "Invalid example kind"})
235
 
236
 
237
  def downsample_video(video_path):
238
  vidcap = cv2.VideoCapture(video_path)
239
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
240
- fps = vidcap.get(cv2.CAP_PROP_FPS)
241
  frames = []
242
- if total_frames <= 0 or fps <= 0:
 
243
  vidcap.release()
244
  return frames
245
- frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
246
  for i in frame_indices:
247
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
248
  success, image = vidcap.read()
249
  if success:
250
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
251
  pil_image = Image.fromarray(image)
252
- timestamp = round(i / fps, 2)
253
  frames.append((pil_image, timestamp))
254
  vidcap.release()
255
  return frames
@@ -276,7 +284,7 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
276
  if image is None:
277
  raise gr.Error("Please upload an image.")
278
  if not text or not str(text).strip():
279
- raise gr.Error("Please enter your vision/query instruction.")
280
  if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
281
  raise gr.Error("Query is too long. Please shorten your input.")
282
 
@@ -290,7 +298,12 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
290
  ]
291
  }]
292
 
293
- prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
 
 
 
294
  inputs = processor(
295
  text=[prompt_full],
296
  images=[image],
@@ -305,11 +318,11 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
305
  **inputs,
306
  "streamer": streamer,
307
  "max_new_tokens": int(max_new_tokens),
 
308
  "temperature": float(temperature),
309
  "top_p": float(top_p),
310
  "top_k": int(top_k),
311
  "repetition_penalty": float(repetition_penalty),
312
- "do_sample": True,
313
  }
314
 
315
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -333,21 +346,21 @@ def generate_video(model_name, text, video_path, max_new_tokens=1024, temperatur
333
  if not video_path:
334
  raise gr.Error("Please upload a video.")
335
  if not text or not str(text).strip():
336
- raise gr.Error("Please enter your vision/query instruction.")
337
  if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
338
  raise gr.Error("Query is too long. Please shorten your input.")
339
 
340
  processor, model = MODEL_MAP[model_name]
341
  frames = downsample_video(video_path)
342
  if not frames:
343
- raise gr.Error("Failed to read video frames.")
344
 
345
  messages = [
346
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
347
  {"role": "user", "content": [{"type": "text", "text": text}]}
348
  ]
349
- for frame in frames:
350
- image, timestamp = frame
351
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
352
  messages[1]["content"].append({"type": "image", "image": image})
353
 
@@ -387,6 +400,43 @@ def generate_video(model_name, text, video_path, max_new_tokens=1024, temperatur
387
  torch.cuda.empty_cache()
388
 
389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  def noop():
391
  return None
392
 
@@ -396,11 +446,11 @@ css = r"""
396
  *{box-sizing:border-box;margin:0;padding:0}
397
  html,body{height:100%;overflow-x:hidden}
398
  body,.gradio-container{
399
- background:#0b1020!important;
400
  font-family:'Inter',system-ui,-apple-system,sans-serif!important;
401
  font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
402
  }
403
- .dark body,.dark .gradio-container{background:#0b1020!important;color:#e4e4e7!important}
404
  footer{display:none!important}
405
  .hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}
406
 
@@ -411,51 +461,64 @@ footer{display:none!important}
411
  }
412
 
413
  .app-shell{
414
- background:#11182d;border:1px solid #1e2b52;border-radius:16px;
415
  margin:12px auto;max-width:1400px;overflow:hidden;
416
  box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
417
  }
418
  .app-header{
419
- background:linear-gradient(135deg,#11182d,#152042);border-bottom:1px solid #1e2b52;
420
  padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
421
  }
422
  .app-header-left{display:flex;align-items:center;gap:12px}
423
  .app-logo{
424
- width:38px;height:38px;background:linear-gradient(135deg,#0000FF,#2e5bff,#6d8dff);
425
  border-radius:10px;display:flex;align-items:center;justify-content:center;
426
  box-shadow:0 4px 12px rgba(0,0,255,.35);
427
  }
428
  .app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
429
  .app-title{
430
- font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#b8c5ff);
431
  -webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
432
  }
433
  .app-badge{
434
  font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
435
- background:rgba(0,0,255,.12);color:#8ea2ff;border:1px solid rgba(0,0,255,.25);letter-spacing:.3px;
436
  }
437
- .app-badge.fast{background:rgba(46,91,255,.10);color:#93a7ff;border:1px solid rgba(46,91,255,.22)}
438
 
439
- .mode-tabs-bar,.model-tabs-bar{
440
- background:#11182d;border-bottom:1px solid #1e2b52;padding:10px 16px;
441
  display:flex;gap:8px;align-items:center;flex-wrap:wrap;
442
  }
443
- .model-tab,.mode-tab{
444
  display:inline-flex;align-items:center;justify-content:center;gap:6px;
445
- min-width:32px;height:34px;background:transparent;border:1px solid #243669;
446
  border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
447
  color:#ffffff!important;transition:all .15s ease;
448
  }
449
- .model-tab:hover,.mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
450
- .model-tab.active,.mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
451
- .model-tab-label,.mode-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
 
 
 
 
 
 
 
 
 
 
 
 
 
452
 
453
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
454
- .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #1e2b52}
455
- .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#11182d}
456
 
457
  #media-drop-zone{
458
- position:relative;background:#08101d;height:440px;min-height:440px;max-height:440px;
459
  overflow:hidden;
460
  }
461
  #media-drop-zone.drag-over{outline:2px solid #0000FF;outline-offset:-2px;background:rgba(0,0,255,.04)}
@@ -466,7 +529,7 @@ footer{display:none!important}
466
  .upload-click-area{
467
  display:flex;flex-direction:column;align-items:center;justify-content:center;
468
  cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
469
- border:2px dashed #32446d;border-radius:16px;
470
  background:rgba(0,0,255,.03);transition:all .2s ease;gap:8px;text-align:center;
471
  overflow:hidden;
472
  }
@@ -482,7 +545,7 @@ footer{display:none!important}
482
  }
483
  .single-preview-card{
484
  width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
485
- overflow:hidden;border:1px solid #1e2b52;background:#0d1425;
486
  display:flex;align-items:center;justify-content:center;position:relative;
487
  }
488
  .single-preview-card img,.single-preview-card video{
@@ -501,63 +564,67 @@ footer{display:none!important}
501
  .preview-action-btn:hover{background:#0000FF;border-color:#0000FF}
502
 
503
  .hint-bar{
504
- background:rgba(0,0,255,.06);border-top:1px solid #1e2b52;border-bottom:1px solid #1e2b52;
505
  padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
506
  }
507
- .hint-bar b{color:#8ea2ff;font-weight:600}
508
  .hint-bar kbd{
509
- display:inline-block;padding:1px 6px;background:#1b2646;border:1px solid #2d3b6d;
510
  border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
511
  }
512
 
513
- .examples-section{border-top:1px solid #1e2b52;padding:12px 16px}
514
  .examples-title{
515
  font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
516
  letter-spacing:.8px;margin-bottom:10px;
517
  }
518
  .examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
519
  .examples-scroll::-webkit-scrollbar{height:6px}
520
- .examples-scroll::-webkit-scrollbar-track{background:#08101d;border-radius:3px}
521
- .examples-scroll::-webkit-scrollbar-thumb{background:#243669;border-radius:3px}
522
- .examples-scroll::-webkit-scrollbar-thumb:hover{background:#38529a}
523
  .example-card{
524
- flex-shrink:0;width:220px;background:#08101d;border:1px solid #1e2b52;
 
525
  border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
526
  }
527
  .example-card:hover{border-color:#0000FF;transform:translateY(-2px);box-shadow:0 4px 12px rgba(0,0,255,.15)}
528
  .example-card.loading{opacity:.5;pointer-events:none}
529
- .example-thumb-wrap{height:120px;overflow:hidden;background:#11182d}
530
  .example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
 
 
 
 
 
531
  .example-thumb-placeholder{
532
  width:100%;height:100%;display:flex;align-items:center;justify-content:center;
533
- background:#11182d;color:#3f4e78;font-size:11px;
534
  }
535
- .example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px;flex-wrap:wrap}
536
  .example-badge{
537
  display:inline-flex;padding:2px 7px;background:rgba(0,0,255,.12);border-radius:4px;
538
- font-size:10px;font-weight:600;color:#93a7ff;font-family:'JetBrains Mono',monospace;white-space:nowrap;
539
  }
540
- .example-badge.kind{background:rgba(100,130,255,.12);color:#bfd0ff}
541
- .example-badge.kind.video{background:rgba(0,90,255,.12);color:#a7c4ff}
542
  .example-prompt-text{
543
  padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
544
  display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
545
  }
546
 
547
- .panel-card{border-bottom:1px solid #1e2b52}
548
  .panel-card-title{
549
  padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
550
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
551
  }
552
  .panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
553
  .modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
554
  .modern-textarea{
555
- width:100%;background:#08101d;border:1px solid #1e2b52;border-radius:8px;
556
  padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
557
  resize:none;outline:none;min-height:100px;transition:border-color .2s;
558
  }
559
  .modern-textarea:focus{border-color:#0000FF;box-shadow:0 0 0 3px rgba(0,0,255,.15)}
560
- .modern-textarea::placeholder{color:#4e5d89}
561
  .modern-textarea.error-flash{
562
  border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
563
  }
@@ -572,21 +639,21 @@ footer{display:none!important}
572
  }
573
  .toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
574
  .toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
575
- .toast-notification.warning{background:linear-gradient(135deg,#1d4ed8,#1e40af);color:#fff;border:1px solid rgba(255,255,255,.15)}
576
- .toast-notification.info{background:linear-gradient(135deg,#0000FF,#1d4ed8);color:#fff;border:1px solid rgba(255,255,255,.15)}
577
  .toast-notification .toast-icon{font-size:16px;line-height:1}
578
  .toast-notification .toast-text{line-height:1.3}
579
 
580
  .btn-run{
581
  display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
582
- background:linear-gradient(135deg,#0000FF,#1d4ed8);border:none;border-radius:10px;
583
  padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
584
  color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
585
  transition:all .2s ease;letter-spacing:-.2px;
586
  box-shadow:0 4px 16px rgba(0,0,255,.3),inset 0 1px 0 rgba(255,255,255,.1);
587
  }
588
  .btn-run:hover{
589
- background:linear-gradient(135deg,#315cff,#0000FF);transform:translateY(-1px);
590
  box-shadow:0 6px 24px rgba(0,0,255,.45),inset 0 1px 0 rgba(255,255,255,.15);
591
  }
592
  .btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(0,0,255,.3)}
@@ -594,7 +661,7 @@ footer{display:none!important}
594
  color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
595
  }
596
 
597
- .output-frame{border-bottom:1px solid #1e2b52;display:flex;flex-direction:column;position:relative}
598
  .output-frame .out-title,
599
  .output-frame .out-title *,
600
  #output-title-label{
@@ -603,104 +670,104 @@ footer{display:none!important}
603
  }
604
  .output-frame .out-title{
605
  padding:10px 20px;font-size:13px;font-weight:700;
606
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(30,43,82,.6);
607
  display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
608
  }
609
  .out-title-right{display:flex;gap:8px;align-items:center}
610
  .out-action-btn{
611
  display:inline-flex;align-items:center;justify-content:center;background:rgba(0,0,255,.1);
612
  border:1px solid rgba(0,0,255,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
613
- font-size:11px;font-weight:500;color:#93a7ff!important;gap:4px;height:24px;transition:all .15s;
614
  }
615
  .out-action-btn:hover{background:rgba(0,0,255,.2);border-color:rgba(0,0,255,.35);color:#ffffff!important}
616
- .out-action-btn svg{width:12px;height:12px;fill:#93a7ff}
617
  .output-frame .out-body{
618
- flex:1;background:#08101d;display:flex;align-items:stretch;justify-content:stretch;
619
  overflow:hidden;min-height:320px;position:relative;
620
  }
621
  .output-scroll-wrap{
622
  width:100%;height:100%;padding:0;overflow:hidden;
623
  }
624
  .output-textarea{
625
- width:100%;height:320px;min-height:320px;max-height:320px;background:#08101d;color:#e4e4e7;
626
  border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
627
  font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
628
  }
629
- .output-textarea::placeholder{color:#5f6d96}
630
  .output-textarea.error-flash{
631
  box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
632
  }
633
  .modern-loader{
634
- display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(8,16,29,.92);
635
  z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
636
  }
637
  .modern-loader.active{display:flex}
638
  .modern-loader .loader-spinner{
639
- width:36px;height:36px;border:3px solid #243669;border-top-color:#0000FF;
640
  border-radius:50%;animation:spin .8s linear infinite;
641
  }
642
  @keyframes spin{to{transform:rotate(360deg)}}
643
  .modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
644
- .loader-bar-track{width:200px;height:4px;background:#243669;border-radius:2px;overflow:hidden}
645
  .loader-bar-fill{
646
- height:100%;background:linear-gradient(90deg,#0000FF,#4b74ff,#0000FF);
647
  background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
648
  }
649
  @keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}
650
 
651
- .settings-group{border:1px solid #1e2b52;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
652
  .settings-group-title{
653
  font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
654
- padding:10px 16px;border-bottom:1px solid #1e2b52;background:rgba(17,24,45,.5);
655
  }
656
  .settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
657
  .slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
658
  .slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
659
  .slider-row input[type="range"]{
660
- flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#243669;
661
  border-radius:3px;outline:none;min-width:0;
662
  }
663
  .slider-row input[type="range"]::-webkit-slider-thumb{
664
- -webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
665
  border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(0,0,255,.4);transition:transform .15s;
666
  }
667
  .slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
668
  .slider-row input[type="range"]::-moz-range-thumb{
669
- width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#1d4ed8);
670
  border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(0,0,255,.4);
671
  }
672
  .slider-row .slider-val{
673
  min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
674
- font-weight:500;padding:3px 8px;background:#08101d;border:1px solid #1e2b52;
675
  border-radius:6px;color:#a1a1aa;flex-shrink:0;
676
  }
677
 
678
  .app-statusbar{
679
- background:#11182d;border-top:1px solid #1e2b52;padding:6px 20px;
680
  display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
681
  }
682
  .app-statusbar .sb-section{
683
  padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
684
- font-size:12px;color:#6b7cae;overflow:hidden;white-space:nowrap;
685
  }
686
  .app-statusbar .sb-section.sb-fixed{
687
  flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
688
- padding:3px 12px;background:rgba(0,0,255,.08);border-radius:6px;color:#93a7ff;font-weight:500;
689
  }
690
 
691
- .exp-note{padding:10px 20px;font-size:12px;color:#6b7cae;border-top:1px solid #1e2b52;text-align:center}
692
- .exp-note a{color:#93a7ff;text-decoration:none}
693
  .exp-note a:hover{text-decoration:underline}
694
 
695
  ::-webkit-scrollbar{width:8px;height:8px}
696
- ::-webkit-scrollbar-track{background:#08101d}
697
- ::-webkit-scrollbar-thumb{background:#243669;border-radius:4px}
698
- ::-webkit-scrollbar-thumb:hover{background:#38529a}
699
 
700
  @media(max-width:980px){
701
  .app-main-row{flex-direction:column}
702
  .app-main-right{width:100%}
703
- .app-main-left{border-right:none;border-bottom:1px solid #1e2b52}
704
  }
705
  """
706
 
@@ -730,7 +797,8 @@ function init() {
730
  }
731
 
732
  window.__visionScopeInitDone = true;
733
- let fileState = null;
 
734
  let toastTimer = null;
735
 
736
  function showToast(message, type) {
@@ -798,10 +866,10 @@ function init() {
798
  });
799
  }
800
 
801
- function syncFileToGradio() {
802
- setGradioValue('hidden-file-b64', fileState ? fileState.b64 : '');
803
- setGradioValue('hidden-input-kind', fileState ? fileState.kind : getActiveMode());
804
- const txt = fileState ? ('1 ' + fileState.kind + ' uploaded') : ('No ' + getActiveMode() + ' uploaded');
805
  if (mediaStatus) mediaStatus.textContent = txt;
806
  }
807
 
@@ -815,90 +883,63 @@ function init() {
815
 
816
  function syncModeToGradio(mode) {
817
  setGradioValue('hidden-mode-name', mode);
818
- setGradioValue('hidden-input-kind', fileState ? fileState.kind : mode);
819
- const sub = document.getElementById('upload-sub-text');
820
- const main = document.getElementById('upload-main-text');
821
- if (mode === 'video') {
822
- if (main) main.textContent = 'Click or drag a video here';
823
- if (sub) sub.textContent = 'Upload one short video clip for multimodal video understanding';
824
- } else {
825
- if (main) main.textContent = 'Click or drag an image here';
826
- if (sub) sub.textContent = 'Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks';
827
- }
828
- if (!fileState && mediaStatus) mediaStatus.textContent = 'No ' + mode + ' uploaded';
829
- }
830
-
831
- function getActiveMode() {
832
- const active = document.querySelector('.mode-tab.active');
833
- return active ? active.getAttribute('data-mode') : 'image';
834
  }
835
 
836
- function activateModeTab(name) {
837
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
838
- btn.classList.toggle('active', btn.getAttribute('data-mode') === name);
839
- });
840
- syncModeToGradio(name);
841
- if (fileState && fileState.kind !== name) {
842
- clearPreview();
 
 
 
843
  }
844
- }
845
- window.__activateModeTab = activateModeTab;
846
 
847
- function activateModelTab(name) {
848
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
849
- btn.classList.toggle('active', btn.getAttribute('data-model') === name);
850
- });
851
- syncModelToGradio(name);
852
- }
853
- window.__activateModelTab = activateModelTab;
854
-
855
- function setPreview(kind, b64, name) {
856
- fileState = {kind, b64, name: name || kind};
857
- if (kind === 'video') {
858
- previewVideo.src = b64;
859
- previewVideo.style.display = 'block';
860
  previewImg.style.display = 'none';
861
- previewVideo.load();
 
 
862
  } else {
863
- previewImg.src = b64;
864
- previewImg.style.display = 'block';
865
- previewVideo.style.display = 'none';
866
  previewVideo.removeAttribute('src');
 
 
 
 
 
867
  }
868
- previewWrap.style.display = 'flex';
869
  if (uploadPrompt) uploadPrompt.style.display = 'none';
870
- activateModeTab(kind);
871
- syncFileToGradio();
 
 
 
 
872
  }
873
  window.__setPreview = setPreview;
874
 
875
  function clearPreview() {
876
- fileState = null;
877
- previewImg.src = '';
878
- previewVideo.pause();
879
- previewVideo.removeAttribute('src');
880
- previewVideo.load();
881
- previewWrap.style.display = 'none';
882
- if (uploadPrompt) uploadPrompt.style.display = 'flex';
883
- syncFileToGradio();
884
  }
885
  window.__clearPreview = clearPreview;
886
 
887
  function processFile(file) {
888
  if (!file) return;
889
- const mode = getActiveMode();
890
-
891
- if (mode === 'image' && !file.type.startsWith('image/')) {
892
  showToast('Only image files are supported in Image mode', 'error');
893
  return;
894
  }
895
- if (mode === 'video' && !file.type.startsWith('video/')) {
896
  showToast('Only video files are supported in Video mode', 'error');
897
  return;
898
  }
899
-
900
  const reader = new FileReader();
901
- reader.onload = (e) => setPreview(mode, e.target.result, file.name);
902
  reader.readAsDataURL(file);
903
  }
904
 
@@ -908,6 +949,17 @@ function init() {
908
  e.target.value = '';
909
  });
910
 
 
 
 
 
 
 
 
 
 
 
 
911
  if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
912
  if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
913
  if (btnClear) btnClear.addEventListener('click', clearPreview);
@@ -928,18 +980,32 @@ function init() {
928
 
929
  promptInput.addEventListener('input', syncPromptToGradio);
930
 
931
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
932
- btn.addEventListener('click', () => {
933
- const model = btn.getAttribute('data-model');
934
- activateModelTab(model);
935
  });
936
- });
 
 
937
 
938
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
939
- btn.addEventListener('click', () => {
940
- const mode = btn.getAttribute('data-mode');
941
- activateModeTab(mode);
942
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
943
  });
944
 
945
  activateModelTab('DeepCaption-VLA-7B');
@@ -973,24 +1039,22 @@ function init() {
973
 
974
  function validateBeforeRun() {
975
  const promptVal = promptInput.value.trim();
976
- const currentMode = getActiveMode();
977
-
978
- if (!fileState && !promptVal) {
979
- showToast('Please upload a file and enter your instruction', 'error');
980
  flashPromptError();
981
  return false;
982
  }
983
- if (!fileState) {
984
- showToast('Please upload a ' + currentMode, 'error');
985
  return false;
986
  }
987
- if (!promptVal) {
988
- showToast('Please enter your vision/query instruction', 'warning');
989
- flashPromptError();
990
  return false;
991
  }
992
- if (fileState.kind !== currentMode) {
993
- showToast('Uploaded file type does not match active mode', 'error');
 
994
  return false;
995
  }
996
  const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
@@ -1004,10 +1068,11 @@ function init() {
1004
  window.__clickGradioRunBtn = function() {
1005
  if (!validateBeforeRun()) return;
1006
  syncPromptToGradio();
1007
- syncFileToGradio();
1008
  const activeModel = document.querySelector('.model-tab.active');
1009
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1010
- syncModeToGradio(getActiveMode());
 
1011
  if (outputArea) outputArea.value = '';
1012
  showLoader();
1013
  setTimeout(() => {
@@ -1027,6 +1092,7 @@ function init() {
1027
  const text = outputArea ? outputArea.value : '';
1028
  if (!text.trim()) {
1029
  showToast('No output to copy', 'warning');
 
1030
  return;
1031
  }
1032
  await navigator.clipboard.writeText(text);
@@ -1063,12 +1129,10 @@ function init() {
1063
  document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1064
  card.addEventListener('click', () => {
1065
  const idx = card.getAttribute('data-idx');
1066
- const kind = card.getAttribute('data-kind');
1067
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1068
  card.classList.add('loading');
1069
  showToast('Loading example...', 'info');
1070
  setGradioValue('example-result-data', '');
1071
- setGradioValue('example-kind-input', kind);
1072
  setGradioValue('example-idx-input', idx);
1073
  setTimeout(() => {
1074
  const btn = document.getElementById('example-load-btn');
@@ -1090,13 +1154,13 @@ function init() {
1090
  const data = JSON.parse(el.value);
1091
  if (data.status === 'ok') {
1092
  window.__lastExampleVal = el.value;
1093
- if (data.file && data.kind) setPreview(data.kind, data.file, data.name || 'example');
 
1094
  if (data.query) {
1095
  promptInput.value = data.query;
1096
  syncPromptToGradio();
1097
  }
1098
  if (data.model) activateModelTab(data.model);
1099
- if (data.kind) activateModeTab(data.kind);
1100
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1101
  showToast('Example loaded', 'info');
1102
  } else if (data.status === 'error') {
@@ -1150,10 +1214,9 @@ watchOutputs();
1150
  }
1151
  """
1152
 
1153
- APP_LOGO_SVG = """
1154
  <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
1155
- <path d="M12 2 4 6v6c0 5 3.4 9.4 8 10 4.6-.6 8-5 8-10V6l-8-4Z" fill="white"/>
1156
- <path d="M9 11.5 11 13.5 15.5 9" stroke="#0000FF" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
1157
  </svg>
1158
  """
1159
 
@@ -1174,14 +1237,14 @@ MODEL_TABS_HTML = "".join([
1174
  ])
1175
 
1176
  MODE_TABS_HTML = """
1177
- <button class="mode-tab active" data-mode="image"><span class="mode-tab-label">Image Inference</span></button>
1178
- <button class="mode-tab" data-mode="video"><span class="mode-tab-label">Video Inference</span></button>
1179
  """
1180
 
1181
  with gr.Blocks() as demo:
1182
- hidden_file_b64 = gr.Textbox(value="", elem_id="hidden-file-b64", elem_classes="hidden-input", container=False)
1183
  hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
1184
- hidden_input_kind = gr.Textbox(value="image", elem_id="hidden-input-kind", elem_classes="hidden-input", container=False)
 
1185
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1186
  hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1187
 
@@ -1194,7 +1257,6 @@ with gr.Blocks() as demo:
1194
 
1195
  result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)
1196
 
1197
- example_kind = gr.Textbox(value="", elem_id="example-kind-input", elem_classes="hidden-input", container=False)
1198
  example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
1199
  example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
1200
  example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")
@@ -1203,21 +1265,21 @@ with gr.Blocks() as demo:
1203
  <div class="app-shell">
1204
  <div class="app-header">
1205
  <div class="app-header-left">
1206
- <div class="app-logo">{APP_LOGO_SVG}</div>
1207
  <span class="app-title">VisionScope R2</span>
1208
  <span class="app-badge">vision enabled</span>
1209
- <span class="app-badge fast">Blue Suite</span>
1210
  </div>
1211
  </div>
1212
 
1213
- <div class="mode-tabs-bar">
1214
- {MODE_TABS_HTML}
1215
- </div>
1216
-
1217
  <div class="model-tabs-bar">
1218
  {MODEL_TABS_HTML}
1219
  </div>
1220
 
 
 
 
 
1221
  <div class="app-main-row">
1222
  <div class="app-main-left">
1223
  <div id="media-drop-zone">
@@ -1225,11 +1287,11 @@ with gr.Blocks() as demo:
1225
  <div id="upload-click-area" class="upload-click-area">
1226
  {UPLOAD_PREVIEW_SVG}
1227
  <span id="upload-main-text" class="upload-main-text">Click or drag an image here</span>
1228
- <span id="upload-sub-text" class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks</span>
1229
  </div>
1230
  </div>
1231
 
1232
- <input id="custom-file-input" type="file" accept="image/*,video/*" style="display:none;" />
1233
 
1234
  <div id="single-preview-wrap" class="single-preview-wrap">
1235
  <div class="single-preview-card">
@@ -1244,9 +1306,10 @@ with gr.Blocks() as demo:
1244
  </div>
1245
 
1246
  <div class="hint-bar">
1247
- <b>Upload:</b> Click or drag to add an image or video &nbsp;&middot;&nbsp;
1248
  <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
1249
- <b>Model:</b> Choose model tabs from the header
 
1250
  </div>
1251
 
1252
  <div class="examples-section">
@@ -1262,13 +1325,13 @@ with gr.Blocks() as demo:
1262
  <div class="panel-card-title">Vision Instruction</div>
1263
  <div class="panel-card-body">
1264
  <label class="modern-label" for="custom-query-input">Query Input</label>
1265
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., extract the text, describe the image, explain the scene, summarize the video, count objects, estimate distance..."></textarea>
1266
  </div>
1267
  </div>
1268
 
1269
  <div style="padding:12px 20px;">
1270
  <button id="custom-run-btn" class="btn-run">
1271
- <span id="run-btn-label">Run Vision</span>
1272
  </button>
1273
  </div>
1274
 
@@ -1283,7 +1346,7 @@ with gr.Blocks() as demo:
1283
  <div class="out-body">
1284
  <div class="modern-loader" id="output-loader">
1285
  <div class="loader-spinner"></div>
1286
- <div class="loader-text">Running vision inference...</div>
1287
  <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
1288
  </div>
1289
  <div class="output-scroll-wrap">
@@ -1331,7 +1394,7 @@ with gr.Blocks() as demo:
1331
  </div>
1332
 
1333
  <div class="exp-note">
1334
- Experimental Vision Suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/VisionScope-R2" target="_blank">GitHub</a>
1335
  </div>
1336
 
1337
  <div class="app-statusbar">
@@ -1343,86 +1406,17 @@ with gr.Blocks() as demo:
1343
 
1344
  run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1345
 
1346
- def b64_to_pil(b64_str):
1347
- if not b64_str:
1348
- return None
1349
- try:
1350
- if b64_str.startswith("data:image"):
1351
- _, data = b64_str.split(",", 1)
1352
- else:
1353
- data = b64_str
1354
- image_data = base64.b64decode(data)
1355
- return Image.open(BytesIO(image_data)).convert("RGB")
1356
- except Exception:
1357
- return None
1358
-
1359
- def b64_to_temp_video(b64_str):
1360
- if not b64_str:
1361
- return None
1362
- try:
1363
- header, data = b64_str.split(",", 1) if "," in b64_str else ("", b64_str)
1364
- os.makedirs("/tmp/visionscope_r2", exist_ok=True)
1365
- ext = ".mp4"
1366
- if "video/webm" in header:
1367
- ext = ".webm"
1368
- elif "video/quicktime" in header:
1369
- ext = ".mov"
1370
- elif "video/x-matroska" in header:
1371
- ext = ".mkv"
1372
- path = f"/tmp/visionscope_r2/{uuid.uuid4().hex}{ext}"
1373
- with open(path, "wb") as f:
1374
- f.write(base64.b64decode(data))
1375
- return path
1376
- except Exception:
1377
- return None
1378
-
1379
- def run_vision(mode_name, model_name, text, file_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
1380
- if mode_name == "video":
1381
- temp_video_path = b64_to_temp_video(file_b64)
1382
- if not temp_video_path:
1383
- raise gr.Error("Failed to decode uploaded video.")
1384
- try:
1385
- yield from generate_video(
1386
- model_name=model_name,
1387
- text=text,
1388
- video_path=temp_video_path,
1389
- max_new_tokens=max_new_tokens_v,
1390
- temperature=temperature_v,
1391
- top_p=top_p_v,
1392
- top_k=top_k_v,
1393
- repetition_penalty=repetition_penalty_v,
1394
- gpu_timeout=gpu_timeout_v,
1395
- )
1396
- finally:
1397
- try:
1398
- if os.path.exists(temp_video_path):
1399
- os.remove(temp_video_path)
1400
- except Exception:
1401
- pass
1402
- else:
1403
- image = b64_to_pil(file_b64)
1404
- yield from generate_image(
1405
- model_name=model_name,
1406
- text=text,
1407
- image=image,
1408
- max_new_tokens=max_new_tokens_v,
1409
- temperature=temperature_v,
1410
- top_p=top_p_v,
1411
- top_k=top_k_v,
1412
- repetition_penalty=repetition_penalty_v,
1413
- gpu_timeout=gpu_timeout_v,
1414
- )
1415
-
1416
  demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1417
  demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1418
 
1419
  run_btn.click(
1420
- fn=run_vision,
1421
  inputs=[
1422
  hidden_mode_name,
1423
  hidden_model_name,
1424
  prompt,
1425
- hidden_file_b64,
 
1426
  max_new_tokens,
1427
  temperature,
1428
  top_p,
@@ -1431,26 +1425,36 @@ with gr.Blocks() as demo:
1431
  gpu_duration_state,
1432
  ],
1433
  outputs=[result],
1434
- js=r"""(mode, model, p, filev, mnt, t, tp, tk, rp, gd) => {
1435
  const modelEl = document.querySelector('.model-tab.active');
1436
  const modeEl = document.querySelector('.mode-tab.active');
1437
- const chosenModel = modelEl ? modelEl.getAttribute('data-model') : model;
1438
- const chosenMode = modeEl ? modeEl.getAttribute('data-mode') : mode;
1439
  const promptEl = document.getElementById('custom-query-input');
1440
  const promptVal = promptEl ? promptEl.value : p;
1441
- const fileContainer = document.getElementById('hidden-file-b64');
1442
- let fileVal = filev;
1443
- if (fileContainer) {
1444
- const inner = fileContainer.querySelector('textarea, input');
1445
- if (inner) fileVal = inner.value;
 
 
 
 
 
1446
  }
1447
- return [chosenMode, chosenModel, promptVal, fileVal, mnt, t, tp, tk, rp, gd];
 
 
 
 
 
1448
  }""",
1449
  )
1450
 
1451
  example_load_btn.click(
1452
  fn=load_example_data,
1453
- inputs=[example_kind, example_idx],
1454
  outputs=[example_result],
1455
  queue=False,
1456
  )
 
1
  import os
2
  import gc
3
  import re
 
4
  import json
 
 
5
  import uuid
6
  import time
7
+ import base64
8
+ import random
9
  from io import BytesIO
10
  from threading import Thread
11
 
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
  print("Using device:", device)
32
 
33
+
34
def load_model(model_id, cls, **kwargs):
    """Load a pretrained vision-language checkpoint ready for inference.

    Args:
        model_id: Hugging Face repo id of the checkpoint.
        cls: transformers model class exposing ``from_pretrained``.
        **kwargs: extra options forwarded verbatim to ``from_pretrained``.

    Returns:
        The model in float16, moved to the global ``device``, in eval mode.
    """
    model = cls.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        **kwargs,
    )
    # Move once, then freeze into inference mode.
    return model.to(device).eval()
41
+
42
+
43
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
44
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
45
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
92
  "coreOCR-7B-050325-preview": (processor_k, model_k),
93
  "SpaceOm-3B": (processor_y, model_y),
94
  }
95
+
96
  MODEL_CHOICES = list(MODEL_MAP.keys())
97
 
98
# Curated demo prompts rendered as clickable example cards in the UI.
# Each entry carries the prompt text, the media file to load, the model
# tab to activate, and whether it is an image or a video example.
image_examples = [
    dict(query="type out the messy hand-writing as accurately as you can.",
         media="images/1.jpg", model="coreOCR-7B-050325-preview", mode="image"),
    dict(query="count the number of birds and explain the scene in detail.",
         media="images/2.jpeg", model="DeepCaption-VLA-7B", mode="image"),
    dict(query="how far is the Goal from the penalty taker in this image?.",
         media="images/3.png", model="SpaceThinker-3B", mode="image"),
    dict(query="approximately how many meters apart are the chair and bookshelf?.",
         media="images/4.png", model="SkyCaptioner-V1", mode="image"),
    dict(query="how far is the man in the red hat from the pallet of boxes in feet?.",
         media="images/5.jpg", model="SpaceOm-3B", mode="image"),
]

video_examples = [
    dict(query="give the highlights of the movie scene video.",
         media="videos/1.mp4", model="DeepCaption-VLA-7B", mode="video"),
    dict(query="explain the advertisement in detail.",
         media="videos/2.mp4", model="SkyCaptioner-V1", mode="video"),
]

# Flat index used by the example-card loader; image cards come first.
all_examples = [*image_examples, *video_examples]
112
+
113
 
114
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
115
  buf = BytesIO()
 
131
  "mp4": "video/mp4",
132
  "mov": "video/quicktime",
133
  "webm": "video/webm",
 
134
  }.get(ext, "application/octet-stream")
135
  with open(path, "rb") as f:
136
  data = base64.b64encode(f.read()).decode()
137
  return f"data:{mime};base64,{data}"
138
 
139
 
140
+ def make_thumb_b64(path, mode="image", max_dim=240):
141
  try:
142
+ if mode == "video":
143
+ cap = cv2.VideoCapture(path)
144
+ ok, frame = cap.read()
145
+ cap.release()
146
+ if not ok:
147
+ return ""
148
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
149
+ img = Image.fromarray(frame).convert("RGB")
150
+ else:
151
+ img = Image.open(path).convert("RGB")
152
  img.thumbnail((max_dim, max_dim))
153
  return pil_to_data_url(img, "JPEG")
154
  except Exception as e:
 
156
  return ""
157
 
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def build_example_cards_html():
160
  cards = ""
161
+ for i, ex in enumerate(all_examples):
162
+ thumb = make_thumb_b64(ex["media"], ex["mode"])
163
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
164
+ media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
165
  cards += f"""
166
+ <div class="example-card" data-idx="{i}">
167
  <div class="example-thumb-wrap">
168
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
169
+ <div class="example-media-chip">{media_badge}</div>
170
  </div>
171
  <div class="example-meta-row">
172
  <span class="example-badge">{ex["model"]}</span>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  </div>
174
  <div class="example-prompt-text">{prompt_short}</div>
175
  </div>
 
180
  EXAMPLE_CARDS_HTML = build_example_cards_html()
181
 
182
 
183
def load_example_data(idx_str):
    """Resolve one example card (selected by index) into a JSON payload.

    Args:
        idx_str: Index of the card as a string (may be a float-formatted
            number coming from the hidden textbox).

    Returns:
        A JSON string. On success: ``{"status": "ok", ...}`` with the
        base64-encoded media, query, model, mode and file name. On failure:
        ``{"status": "error", "message": ...}``.
    """
    def _error(message):
        # All failure payloads share one shape.
        return json.dumps({"status": "error", "message": message})

    try:
        idx = int(float(idx_str))
    except Exception:
        return _error("Invalid example index")
    if not (0 <= idx < len(all_examples)):
        return _error("Example index out of range")

    example = all_examples[idx]
    media_b64 = file_to_data_url(example["media"])
    if not media_b64:
        return _error(f"Could not load example {example['mode']}")

    payload = {
        "status": "ok",
        "query": example["query"],
        "media": media_b64,
        "model": example["model"],
        "mode": example["mode"],
        "name": os.path.basename(example["media"]),
    }
    return json.dumps(payload)
202
+
203
+
204
def b64_to_pil(b64_str):
    """Decode a base64 image string (optionally a ``data:`` URL) to RGB PIL.

    Args:
        b64_str: Raw base64 payload or a full ``data:<mime>;base64,...`` URL.

    Returns:
        A ``PIL.Image`` in RGB mode, or None for empty input or any
        decode/parse failure.
    """
    if not b64_str:
        return None
    try:
        payload = b64_str.split(",", 1)[1] if b64_str.startswith("data:") else b64_str
        return Image.open(BytesIO(base64.b64decode(payload))).convert("RGB")
    except Exception:
        # Corrupt base64 or non-image bytes: the caller treats None as
        # "no usable image" and raises its own user-facing error.
        return None
216
 
217
+
218
def b64_to_temp_video(b64_str):
    """Decode a base64 (or ``data:`` URL) video payload into a temp file.

    Args:
        b64_str: Raw base64 string or a ``data:<mime>;base64,...`` URL as
            produced by the front-end file reader.

    Returns:
        Path of the written temp file (extension chosen from the MIME
        type), or None when the input is empty or cannot be decoded.
    """
    if not b64_str:
        return None
    # Local import: only this helper needs it.
    import tempfile
    try:
        if b64_str.startswith("data:"):
            header, data = b64_str.split(",", 1)
            mime = header.split(";")[0].replace("data:", "")
        else:
            data = b64_str
            mime = "video/mp4"  # no header -> assume the most common container
        ext = {
            "video/mp4": ".mp4",
            "video/webm": ".webm",
            "video/quicktime": ".mov",
            # Restore .mkv support present in the previous revision.
            "video/x-matroska": ".mkv",
        }.get(mime, ".mp4")
        raw = base64.b64decode(data)
        # Use the platform temp dir instead of a hard-coded "/tmp" so the
        # app also works on non-POSIX hosts.
        temp_dir = os.path.join(tempfile.gettempdir(), "visionscope_r2_media")
        os.makedirs(temp_dir, exist_ok=True)
        path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
        with open(path, "wb") as f:
            f.write(raw)
        return path
    except Exception:
        # Malformed base64 / unwritable disk: signal failure with None so
        # the caller can raise a user-facing error.
        return None
 
 
 
 
 
 
 
 
242
 
243
 
244
def downsample_video(video_path):
    """Evenly sample up to 10 frames from a video as (PIL RGB, timestamp).

    Args:
        video_path: Path to a video file readable by OpenCV.

    Returns:
        List of ``(PIL.Image, seconds)`` tuples; empty when the video has
        no readable frames. Timestamps are rounded to 2 decimals and use
        fps=1.0 as a fallback when the container reports no frame rate.
    """
    capture = cv2.VideoCapture(video_path)
    total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = capture.get(cv2.CAP_PROP_FPS) or 1.0
    sampled = []
    n_frames = min(total, 10) if total > 0 else 0
    if n_frames == 0:
        capture.release()
        return sampled
    # Evenly spaced frame indices across the whole clip (inclusive ends).
    for idx in np.linspace(0, total - 1, n_frames, dtype=int):
        capture.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, bgr = capture.read()
        if ok:
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            stamp = round(float(idx) / float(fps), 2)
            sampled.append((Image.fromarray(rgb), stamp))
    capture.release()
    return sampled
 
284
  if image is None:
285
  raise gr.Error("Please upload an image.")
286
  if not text or not str(text).strip():
287
+ raise gr.Error("Please enter your instruction.")
288
  if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
289
  raise gr.Error("Query is too long. Please shorten your input.")
290
 
 
298
  ]
299
  }]
300
 
301
+ prompt_full = processor.apply_chat_template(
302
+ messages,
303
+ tokenize=False,
304
+ add_generation_prompt=True
305
+ )
306
+
307
  inputs = processor(
308
  text=[prompt_full],
309
  images=[image],
 
318
  **inputs,
319
  "streamer": streamer,
320
  "max_new_tokens": int(max_new_tokens),
321
+ "do_sample": True,
322
  "temperature": float(temperature),
323
  "top_p": float(top_p),
324
  "top_k": int(top_k),
325
  "repetition_penalty": float(repetition_penalty),
 
326
  }
327
 
328
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
 
346
  if not video_path:
347
  raise gr.Error("Please upload a video.")
348
  if not text or not str(text).strip():
349
+ raise gr.Error("Please enter your instruction.")
350
  if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
351
  raise gr.Error("Query is too long. Please shorten your input.")
352
 
353
  processor, model = MODEL_MAP[model_name]
354
  frames = downsample_video(video_path)
355
  if not frames:
356
+ raise gr.Error("Could not read the uploaded video.")
357
 
358
  messages = [
359
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
360
  {"role": "user", "content": [{"type": "text", "text": text}]}
361
  ]
362
+
363
+ for image, timestamp in frames:
364
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
365
  messages[1]["content"].append({"type": "image", "image": image})
366
 
 
400
  torch.cuda.empty_cache()
401
 
402
 
403
def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
    """Dispatch a streaming generation request to the image or video path.

    Args:
        mode: "video" routes to ``generate_video``; anything else routes to
            ``generate_image``.
        model_name: Key into MODEL_MAP selecting processor/model.
        text: User instruction.
        image_b64 / video_b64: Base64 payloads for the respective mode.
        Remaining args: sampling parameters forwarded unchanged.

    Yields:
        Streaming text chunks from the selected generator.

    Raises:
        gr.Error: When a video payload cannot be decoded.
    """
    shared = dict(
        model_name=model_name,
        text=text,
        max_new_tokens=max_new_tokens_v,
        temperature=temperature_v,
        top_p=top_p_v,
        top_k=top_k_v,
        repetition_penalty=repetition_penalty_v,
        gpu_timeout=gpu_timeout_v,
    )
    if mode == "video":
        temp_path = b64_to_temp_video(video_b64)
        if not temp_path:
            raise gr.Error("Could not decode uploaded video.")
        try:
            yield from generate_video(video_path=temp_path, **shared)
        finally:
            # Best-effort cleanup of the decoded temp file.
            try:
                os.remove(temp_path)
            except Exception:
                pass
    else:
        yield from generate_image(image=b64_to_pil(image_b64), **shared)
438
+
439
+
440
def noop():
    """Do nothing; placeholder callback for events whose work is pure JS."""
    return None
442
 
 
446
  *{box-sizing:border-box;margin:0;padding:0}
447
  html,body{height:100%;overflow-x:hidden}
448
  body,.gradio-container{
449
+ background:#0f0f13!important;
450
  font-family:'Inter',system-ui,-apple-system,sans-serif!important;
451
  font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
452
  }
453
+ .dark body,.dark .gradio-container{background:#0f0f13!important;color:#e4e4e7!important}
454
  footer{display:none!important}
455
  .hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}
456
 
 
461
  }
462
 
463
  .app-shell{
464
+ background:#18181b;border:1px solid #27272a;border-radius:16px;
465
  margin:12px auto;max-width:1400px;overflow:hidden;
466
  box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
467
  }
468
  .app-header{
469
+ background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a;
470
  padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
471
  }
472
  .app-header-left{display:flex;align-items:center;gap:12px}
473
  .app-logo{
474
+ width:38px;height:38px;background:linear-gradient(135deg,#0000FF,#335CFF,#6680FF);
475
  border-radius:10px;display:flex;align-items:center;justify-content:center;
476
  box-shadow:0 4px 12px rgba(0,0,255,.35);
477
  }
478
  .app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
479
  .app-title{
480
+ font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#bdbdbd);
481
  -webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
482
  }
483
  .app-badge{
484
  font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
485
+ background:rgba(0,0,255,.12);color:#8aa2ff;border:1px solid rgba(0,0,255,.25);letter-spacing:.3px;
486
  }
487
+ .app-badge.fast{background:rgba(51,92,255,.10);color:#7f9cff;border:1px solid rgba(51,92,255,.22)}
488
 
489
+ .model-tabs-bar{
490
+ background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px;
491
  display:flex;gap:8px;align-items:center;flex-wrap:wrap;
492
  }
493
+ .model-tab{
494
  display:inline-flex;align-items:center;justify-content:center;gap:6px;
495
+ min-width:32px;height:34px;background:transparent;border:1px solid #27272a;
496
  border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
497
  color:#ffffff!important;transition:all .15s ease;
498
  }
499
+ .model-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
500
+ .model-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
501
+ .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
502
+
503
+ .mode-tabs-bar{
504
+ background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
505
+ display:flex;gap:8px;align-items:center;flex-wrap:wrap;
506
+ }
507
+ .mode-tab{
508
+ display:inline-flex;align-items:center;justify-content:center;gap:6px;
509
+ min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
510
+ border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
511
+ color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
512
+ }
513
+ .mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
514
+ .mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
515
 
516
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
517
+ .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
518
+ .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
519
 
520
  #media-drop-zone{
521
+ position:relative;background:#09090b;height:440px;min-height:440px;max-height:440px;
522
  overflow:hidden;
523
  }
524
  #media-drop-zone.drag-over{outline:2px solid #0000FF;outline-offset:-2px;background:rgba(0,0,255,.04)}
 
529
  .upload-click-area{
530
  display:flex;flex-direction:column;align-items:center;justify-content:center;
531
  cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
532
+ border:2px dashed #3f3f46;border-radius:16px;
533
  background:rgba(0,0,255,.03);transition:all .2s ease;gap:8px;text-align:center;
534
  overflow:hidden;
535
  }
 
545
  }
546
  .single-preview-card{
547
  width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
548
+ overflow:hidden;border:1px solid #27272a;background:#111114;
549
  display:flex;align-items:center;justify-content:center;position:relative;
550
  }
551
  .single-preview-card img,.single-preview-card video{
 
564
  .preview-action-btn:hover{background:#0000FF;border-color:#0000FF}
565
 
566
  .hint-bar{
567
+ background:rgba(0,0,255,.06);border-top:1px solid #27272a;border-bottom:1px solid #27272a;
568
  padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
569
  }
570
+ .hint-bar b{color:#8aa2ff;font-weight:600}
571
  .hint-bar kbd{
572
+ display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46;
573
  border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
574
  }
575
 
576
+ .examples-section{border-top:1px solid #27272a;padding:12px 16px}
577
  .examples-title{
578
  font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
579
  letter-spacing:.8px;margin-bottom:10px;
580
  }
581
  .examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
582
  .examples-scroll::-webkit-scrollbar{height:6px}
583
+ .examples-scroll::-webkit-scrollbar-track{background:#09090b;border-radius:3px}
584
+ .examples-scroll::-webkit-scrollbar-thumb{background:#27272a;border-radius:3px}
585
+ .examples-scroll::-webkit-scrollbar-thumb:hover{background:#3f3f46}
586
  .example-card{
587
+ position:relative;
588
+ flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a;
589
  border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
590
  }
591
  .example-card:hover{border-color:#0000FF;transform:translateY(-2px);box-shadow:0 4px 12px rgba(0,0,255,.15)}
592
  .example-card.loading{opacity:.5;pointer-events:none}
593
+ .example-thumb-wrap{height:120px;overflow:hidden;background:#18181b;position:relative}
594
  .example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
595
+ .example-media-chip{
596
+ position:absolute;top:8px;left:8px;
597
+ display:inline-flex;padding:3px 7px;background:rgba(0,0,0,.7);border:1px solid rgba(255,255,255,.12);
598
+ border-radius:999px;font-size:10px;font-weight:700;color:#fff;letter-spacing:.5px;
599
+ }
600
  .example-thumb-placeholder{
601
  width:100%;height:100%;display:flex;align-items:center;justify-content:center;
602
+ background:#18181b;color:#3f3f46;font-size:11px;
603
  }
604
+ .example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px}
605
  .example-badge{
606
  display:inline-flex;padding:2px 7px;background:rgba(0,0,255,.12);border-radius:4px;
607
+ font-size:10px;font-weight:600;color:#8aa2ff;font-family:'JetBrains Mono',monospace;white-space:nowrap;
608
  }
 
 
609
  .example-prompt-text{
610
  padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
611
  display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
612
  }
613
 
614
+ .panel-card{border-bottom:1px solid #27272a}
615
  .panel-card-title{
616
  padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
617
+ text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
618
  }
619
  .panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
620
  .modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
621
  .modern-textarea{
622
+ width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px;
623
  padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
624
  resize:none;outline:none;min-height:100px;transition:border-color .2s;
625
  }
626
  .modern-textarea:focus{border-color:#0000FF;box-shadow:0 0 0 3px rgba(0,0,255,.15)}
627
+ .modern-textarea::placeholder{color:#3f3f46}
628
  .modern-textarea.error-flash{
629
  border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
630
  }
 
639
  }
640
  .toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
641
  .toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
642
+ .toast-notification.warning{background:linear-gradient(135deg,#d97706,#b45309);color:#fff;border:1px solid rgba(255,255,255,.15)}
643
+ .toast-notification.info{background:linear-gradient(135deg,#1d4ed8,#1e40af);color:#fff;border:1px solid rgba(255,255,255,.15)}
644
  .toast-notification .toast-icon{font-size:16px;line-height:1}
645
  .toast-notification .toast-text{line-height:1.3}
646
 
647
  .btn-run{
648
  display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
649
+ background:linear-gradient(135deg,#0000FF,#003DCC);border:none;border-radius:10px;
650
  padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
651
  color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
652
  transition:all .2s ease;letter-spacing:-.2px;
653
  box-shadow:0 4px 16px rgba(0,0,255,.3),inset 0 1px 0 rgba(255,255,255,.1);
654
  }
655
  .btn-run:hover{
656
+ background:linear-gradient(135deg,#335CFF,#0000FF);transform:translateY(-1px);
657
  box-shadow:0 6px 24px rgba(0,0,255,.45),inset 0 1px 0 rgba(255,255,255,.15);
658
  }
659
  .btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(0,0,255,.3)}
 
661
  color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
662
  }
663
 
664
+ .output-frame{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}
665
  .output-frame .out-title,
666
  .output-frame .out-title *,
667
  #output-title-label{
 
670
  }
671
  .output-frame .out-title{
672
  padding:10px 20px;font-size:13px;font-weight:700;
673
+ text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
674
  display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
675
  }
676
  .out-title-right{display:flex;gap:8px;align-items:center}
677
  .out-action-btn{
678
  display:inline-flex;align-items:center;justify-content:center;background:rgba(0,0,255,.1);
679
  border:1px solid rgba(0,0,255,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
680
+ font-size:11px;font-weight:500;color:#8aa2ff!important;gap:4px;height:24px;transition:all .15s;
681
  }
682
  .out-action-btn:hover{background:rgba(0,0,255,.2);border-color:rgba(0,0,255,.35);color:#ffffff!important}
683
+ .out-action-btn svg{width:12px;height:12px;fill:#8aa2ff}
684
  .output-frame .out-body{
685
+ flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch;
686
  overflow:hidden;min-height:320px;position:relative;
687
  }
688
  .output-scroll-wrap{
689
  width:100%;height:100%;padding:0;overflow:hidden;
690
  }
691
  .output-textarea{
692
+ width:100%;height:320px;min-height:320px;max-height:320px;background:#09090b;color:#e4e4e7;
693
  border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
694
  font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
695
  }
696
+ .output-textarea::placeholder{color:#52525b}
697
  .output-textarea.error-flash{
698
  box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
699
  }
700
  .modern-loader{
701
+ display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92);
702
  z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
703
  }
704
  .modern-loader.active{display:flex}
705
  .modern-loader .loader-spinner{
706
+ width:36px;height:36px;border:3px solid #27272a;border-top-color:#0000FF;
707
  border-radius:50%;animation:spin .8s linear infinite;
708
  }
709
  @keyframes spin{to{transform:rotate(360deg)}}
710
  .modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
711
+ .loader-bar-track{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden}
712
  .loader-bar-fill{
713
+ height:100%;background:linear-gradient(90deg,#0000FF,#6680FF,#0000FF);
714
  background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
715
  }
716
  @keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}
717
 
718
+ .settings-group{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
719
  .settings-group-title{
720
  font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
721
+ padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5);
722
  }
723
  .settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
724
  .slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
725
  .slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
726
  .slider-row input[type="range"]{
727
+ flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a;
728
  border-radius:3px;outline:none;min-width:0;
729
  }
730
  .slider-row input[type="range"]::-webkit-slider-thumb{
731
+ -webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#003DCC);
732
  border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(0,0,255,.4);transition:transform .15s;
733
  }
734
  .slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
735
  .slider-row input[type="range"]::-moz-range-thumb{
736
+ width:16px;height:16px;background:linear-gradient(135deg,#0000FF,#003DCC);
737
  border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(0,0,255,.4);
738
  }
739
  .slider-row .slider-val{
740
  min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
741
+ font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a;
742
  border-radius:6px;color:#a1a1aa;flex-shrink:0;
743
  }
744
 
745
  .app-statusbar{
746
+ background:#18181b;border-top:1px solid #27272a;padding:6px 20px;
747
  display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
748
  }
749
  .app-statusbar .sb-section{
750
  padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
751
+ font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap;
752
  }
753
  .app-statusbar .sb-section.sb-fixed{
754
  flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
755
+ padding:3px 12px;background:rgba(0,0,255,.08);border-radius:6px;color:#8aa2ff;font-weight:500;
756
  }
757
 
758
+ .exp-note{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center}
759
+ .exp-note a{color:#8aa2ff;text-decoration:none}
760
  .exp-note a:hover{text-decoration:underline}
761
 
762
  ::-webkit-scrollbar{width:8px;height:8px}
763
+ ::-webkit-scrollbar-track{background:#09090b}
764
+ ::-webkit-scrollbar-thumb{background:#27272a;border-radius:4px}
765
+ ::-webkit-scrollbar-thumb:hover{background:#3f3f46}
766
 
767
  @media(max-width:980px){
768
  .app-main-row{flex-direction:column}
769
  .app-main-right{width:100%}
770
+ .app-main-left{border-right:none;border-bottom:1px solid #27272a}
771
  }
772
  """
773
 
 
797
  }
798
 
799
  window.__visionScopeInitDone = true;
800
+ let mediaState = null;
801
+ let currentMode = 'image';
802
  let toastTimer = null;
803
 
804
  function showToast(message, type) {
 
866
  });
867
  }
868
 
869
+ function syncMediaToGradio() {
870
+ setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
871
+ setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
872
+ const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
873
  if (mediaStatus) mediaStatus.textContent = txt;
874
  }
875
 
 
883
 
884
  function syncModeToGradio(mode) {
885
  setGradioValue('hidden-mode-name', mode);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886
  }
887
 
888
+ function renderPreview() {
889
+ if (!mediaState) {
890
+ previewImg.src = '';
891
+ previewVideo.src = '';
892
+ previewImg.style.display = 'none';
893
+ previewVideo.style.display = 'none';
894
+ previewWrap.style.display = 'none';
895
+ if (uploadPrompt) uploadPrompt.style.display = 'flex';
896
+ syncMediaToGradio();
897
+ return;
898
  }
 
 
899
 
900
+ if (mediaState.mode === 'video') {
901
+ previewImg.src = '';
 
 
 
 
 
 
 
 
 
 
 
902
  previewImg.style.display = 'none';
903
+ previewVideo.src = mediaState.b64;
904
+ previewVideo.style.display = 'block';
905
+ previewWrap.style.display = 'flex';
906
  } else {
907
+ previewVideo.pause();
 
 
908
  previewVideo.removeAttribute('src');
909
+ previewVideo.load();
910
+ previewVideo.style.display = 'none';
911
+ previewImg.src = mediaState.b64;
912
+ previewImg.style.display = 'block';
913
+ previewWrap.style.display = 'flex';
914
  }
 
915
  if (uploadPrompt) uploadPrompt.style.display = 'none';
916
+ syncMediaToGradio();
917
+ }
918
+
919
+ function setPreview(b64, name, mode) {
920
+ mediaState = {b64, name: name || 'file', mode: mode || currentMode};
921
+ renderPreview();
922
  }
923
  window.__setPreview = setPreview;
924
 
925
  function clearPreview() {
926
+ mediaState = null;
927
+ renderPreview();
 
 
 
 
 
 
928
  }
929
  window.__clearPreview = clearPreview;
930
 
931
  function processFile(file) {
932
  if (!file) return;
933
+ if (currentMode === 'image' && !file.type.startsWith('image/')) {
 
 
934
  showToast('Only image files are supported in Image mode', 'error');
935
  return;
936
  }
937
+ if (currentMode === 'video' && !file.type.startsWith('video/')) {
938
  showToast('Only video files are supported in Video mode', 'error');
939
  return;
940
  }
 
941
  const reader = new FileReader();
942
+ reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
943
  reader.readAsDataURL(file);
944
  }
945
 
 
949
  e.target.value = '';
950
  });
951
 
952
+ function updateAccept() {
953
+ fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
954
+ const main = document.getElementById('upload-main-text');
955
+ const sub = document.getElementById('upload-sub-text');
956
+ if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
957
+ if (sub) sub.textContent = currentMode === 'video'
958
+ ? 'Upload one short video clip for multimodal video understanding'
959
+ : 'Upload one document, page, receipt, screenshot, or scene image for vision tasks';
960
+ if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
961
+ }
962
+
963
  if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
964
  if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
965
  if (btnClear) btnClear.addEventListener('click', clearPreview);
 
980
 
981
  promptInput.addEventListener('input', syncPromptToGradio);
982
 
983
+ function activateModelTab(name) {
984
+ document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
985
+ btn.classList.toggle('active', btn.getAttribute('data-model') === name);
 
986
  });
987
+ syncModelToGradio(name);
988
+ }
989
+ window.__activateModelTab = activateModelTab;
990
 
991
+ function activateModeTab(mode) {
992
+ currentMode = mode;
993
+ document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
994
+ btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
995
  });
996
+ syncModeToGradio(mode);
997
+ updateAccept();
998
+ if (mediaState && mediaState.mode !== mode) {
999
+ clearPreview();
1000
+ }
1001
+ }
1002
+ window.__activateModeTab = activateModeTab;
1003
+
1004
+ document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
1005
+ btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
1006
+ });
1007
+ document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
1008
+ btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
1009
  });
1010
 
1011
  activateModelTab('DeepCaption-VLA-7B');
 
1039
 
1040
  function validateBeforeRun() {
1041
  const promptVal = promptInput.value.trim();
1042
+ if (!mediaState && !promptVal) {
1043
+ showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
 
 
1044
  flashPromptError();
1045
  return false;
1046
  }
1047
+ if (!mediaState) {
1048
+ showToast(`Please upload a ${currentMode}`, 'error');
1049
  return false;
1050
  }
1051
+ if (mediaState.mode !== currentMode) {
1052
+ showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
 
1053
  return false;
1054
  }
1055
+ if (!promptVal) {
1056
+ showToast('Please enter your instruction', 'warning');
1057
+ flashPromptError();
1058
  return false;
1059
  }
1060
  const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
 
1068
  window.__clickGradioRunBtn = function() {
1069
  if (!validateBeforeRun()) return;
1070
  syncPromptToGradio();
1071
+ syncMediaToGradio();
1072
  const activeModel = document.querySelector('.model-tab.active');
1073
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1074
+ const activeMode = document.querySelector('.mode-tab.active');
1075
+ if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
1076
  if (outputArea) outputArea.value = '';
1077
  showLoader();
1078
  setTimeout(() => {
 
1092
  const text = outputArea ? outputArea.value : '';
1093
  if (!text.trim()) {
1094
  showToast('No output to copy', 'warning');
1095
+ flashOutputError();
1096
  return;
1097
  }
1098
  await navigator.clipboard.writeText(text);
 
1129
  document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1130
  card.addEventListener('click', () => {
1131
  const idx = card.getAttribute('data-idx');
 
1132
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1133
  card.classList.add('loading');
1134
  showToast('Loading example...', 'info');
1135
  setGradioValue('example-result-data', '');
 
1136
  setGradioValue('example-idx-input', idx);
1137
  setTimeout(() => {
1138
  const btn = document.getElementById('example-load-btn');
 
1154
  const data = JSON.parse(el.value);
1155
  if (data.status === 'ok') {
1156
  window.__lastExampleVal = el.value;
1157
+ if (data.mode) activateModeTab(data.mode);
1158
+ if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
1159
  if (data.query) {
1160
  promptInput.value = data.query;
1161
  syncPromptToGradio();
1162
  }
1163
  if (data.model) activateModelTab(data.model);
 
1164
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1165
  showToast('Example loaded', 'info');
1166
  } else if (data.status === 'error') {
 
1214
  }
1215
  """
1216
 
1217
+ VISION_LOGO_SVG = """
1218
  <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
1219
+ <path d="M12 5C6.5 5 2.1 8.4 1 12c1.1 3.6 5.5 7 11 7s9.9-3.4 11-7c-1.1-3.6-5.5-7-11-7Zm0 11a4 4 0 1 1 0-8 4 4 0 0 1 0 8Zm0-2.2A1.8 1.8 0 1 0 12 10a1.8 1.8 0 0 0 0 3.6Z" fill="white"/>
 
1220
  </svg>
1221
  """
1222
 
 
1237
  ])
1238
 
1239
  MODE_TABS_HTML = """
1240
+ <button class="mode-tab active" data-mode="image">Image Inference</button>
1241
+ <button class="mode-tab" data-mode="video">Video Inference</button>
1242
  """
1243
 
1244
  with gr.Blocks() as demo:
 
1245
  hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
1246
+ hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1247
+ hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
1248
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1249
  hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1250
 
 
1257
 
1258
  result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)
1259
 
 
1260
  example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
1261
  example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
1262
  example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")
 
1265
  <div class="app-shell">
1266
  <div class="app-header">
1267
  <div class="app-header-left">
1268
+ <div class="app-logo">{VISION_LOGO_SVG}</div>
1269
  <span class="app-title">VisionScope R2</span>
1270
  <span class="app-badge">vision enabled</span>
1271
+ <span class="app-badge fast">Image + Video</span>
1272
  </div>
1273
  </div>
1274
 
 
 
 
 
1275
  <div class="model-tabs-bar">
1276
  {MODEL_TABS_HTML}
1277
  </div>
1278
 
1279
+ <div class="mode-tabs-bar">
1280
+ {MODE_TABS_HTML}
1281
+ </div>
1282
+
1283
  <div class="app-main-row">
1284
  <div class="app-main-left">
1285
  <div id="media-drop-zone">
 
1287
  <div id="upload-click-area" class="upload-click-area">
1288
  {UPLOAD_PREVIEW_SVG}
1289
  <span id="upload-main-text" class="upload-main-text">Click or drag an image here</span>
1290
+ <span id="upload-sub-text" class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for vision tasks</span>
1291
  </div>
1292
  </div>
1293
 
1294
+ <input id="custom-file-input" type="file" accept="image/*" style="display:none;" />
1295
 
1296
  <div id="single-preview-wrap" class="single-preview-wrap">
1297
  <div class="single-preview-card">
 
1306
  </div>
1307
 
1308
  <div class="hint-bar">
1309
+ <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
1310
  <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
1311
+ <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1312
+ <kbd>Clear</kbd> removes the current media
1313
  </div>
1314
 
1315
  <div class="examples-section">
 
1325
  <div class="panel-card-title">Vision Instruction</div>
1326
  <div class="panel-card-body">
1327
  <label class="modern-label" for="custom-query-input">Query Input</label>
1328
+ <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., describe the scene, read the handwriting, explain the video, summarize frames, extract visible text, estimate distance..."></textarea>
1329
  </div>
1330
  </div>
1331
 
1332
  <div style="padding:12px 20px;">
1333
  <button id="custom-run-btn" class="btn-run">
1334
+ <span id="run-btn-label">Run Inference</span>
1335
  </button>
1336
  </div>
1337
 
 
1346
  <div class="out-body">
1347
  <div class="modern-loader" id="output-loader">
1348
  <div class="loader-spinner"></div>
1349
+ <div class="loader-text">Running inference...</div>
1350
  <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
1351
  </div>
1352
  <div class="output-scroll-wrap">
 
1394
  </div>
1395
 
1396
  <div class="exp-note">
1397
+ Experimental vision suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/VisionScope-R2" target="_blank">GitHub</a>
1398
  </div>
1399
 
1400
  <div class="app-statusbar">
 
1406
 
1407
  run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1409
  demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1410
  demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1411
 
1412
  run_btn.click(
1413
+ fn=run_inference,
1414
  inputs=[
1415
  hidden_mode_name,
1416
  hidden_model_name,
1417
  prompt,
1418
+ hidden_image_b64,
1419
+ hidden_video_b64,
1420
  max_new_tokens,
1421
  temperature,
1422
  top_p,
 
1425
  gpu_duration_state,
1426
  ],
1427
  outputs=[result],
1428
+ js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
1429
  const modelEl = document.querySelector('.model-tab.active');
1430
  const modeEl = document.querySelector('.mode-tab.active');
1431
+ const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
1432
+ const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
1433
  const promptEl = document.getElementById('custom-query-input');
1434
  const promptVal = promptEl ? promptEl.value : p;
1435
+
1436
+ let imgVal = img;
1437
+ let vidVal = vid;
1438
+
1439
+ const imgContainer = document.getElementById('hidden-image-b64');
1440
+ const vidContainer = document.getElementById('hidden-video-b64');
1441
+
1442
+ if (imgContainer) {
1443
+ const inner = imgContainer.querySelector('textarea, input');
1444
+ if (inner) imgVal = inner.value;
1445
  }
1446
+ if (vidContainer) {
1447
+ const inner = vidContainer.querySelector('textarea, input');
1448
+ if (inner) vidVal = inner.value;
1449
+ }
1450
+
1451
+ return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
1452
  }""",
1453
  )
1454
 
1455
  example_load_btn.click(
1456
  fn=load_example_data,
1457
+ inputs=[example_idx],
1458
  outputs=[example_result],
1459
  queue=False,
1460
  )