Rename to OneVision Encoder; global top-K; codec-vs-uniform charts
UI / brand
- Rename Space and headings to "OneVision Encoder".
- Hero now carries Homepage / Models / Tech Report / Model Card /
Data Card links as pill-style buttons (white -> gradient on hover).
- Card hover lift; tighter typography rhythm.
Algorithm
- Patch budget is now a single global Top-K spent across the whole
video, not per-frame. High-saliency frames may receive many
patches; low-saliency frames may receive zero.
- Slider renamed to "Total patches budget (whole video)" with
explanatory `info` text; range bumped to 64..8192, default 1024.
Charts (gr.Plot)
- Replaced the score histogram with a codec-vs-uniform comparison:
* Left : x = time (s), y = patches selected at that timestamp by
the codec saliency, including zero bars where the budget
skipped the frame.
* Right : x = time (s), y = full grid size, drawn for the
N_uniform = budget // grid frames a uniform sampler
would extract under the same total budget.
- Both panels share scale so the contrast in allocation is obvious.
Run info
- `top_k_per_frame` -> `total_patches_budget`.
- Adds `actual_selected_total` and a `uniform_baseline` block with
frames / patches_per_frame / total_patches / explanation, so the
chart numbers are reproducible from the JSON alone.
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: OneVision
|
| 3 |
emoji: π
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
|
@@ -9,7 +9,7 @@ python_version: '3.13'
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: apache-2.0
|
| 12 |
-
short_description:
|
| 13 |
---
|
| 14 |
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OneVision Encoder
|
| 3 |
emoji: π
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
|
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: apache-2.0
|
| 12 |
+
short_description: Codec-style patch saliency for video understanding
|
| 13 |
---
|
| 14 |
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
@@ -173,6 +173,7 @@ def compute_score_grids(
|
|
| 173 |
|
| 174 |
|
| 175 |
def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
|
|
|
| 176 |
flat = score.flatten()
|
| 177 |
if k >= flat.size:
|
| 178 |
return np.ones_like(score, dtype=np.uint8)
|
|
@@ -182,6 +183,31 @@ def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
|
| 182 |
return (score >= thresh).astype(np.uint8)
|
| 183 |
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 186 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 187 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
@@ -319,35 +345,84 @@ def pack_canvas(
|
|
| 319 |
|
| 320 |
|
| 321 |
def make_charts(
|
| 322 |
-
grids: List[np.ndarray],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
):
|
| 324 |
-
"""Two side-by-side panels
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
ax1.
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
|
|
|
|
|
|
| 340 |
counts = [int(m.sum()) for m in masks]
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
ax2.spines[["top", "right"]].set_visible(False)
|
| 352 |
|
| 353 |
fig.patch.set_facecolor("white")
|
|
@@ -358,7 +433,7 @@ def process(
|
|
| 358 |
video_path,
|
| 359 |
sample_frames: int,
|
| 360 |
patch_size: int,
|
| 361 |
-
|
| 362 |
max_pixels: int,
|
| 363 |
viz_mode: str = "selection",
|
| 364 |
heatmap_alpha: float = 0.55,
|
|
@@ -426,7 +501,7 @@ def process(
|
|
| 426 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 427 |
if score_log_scale:
|
| 428 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 429 |
-
masks =
|
| 430 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 431 |
|
| 432 |
mode = (viz_mode or "selection").lower()
|
|
@@ -463,12 +538,14 @@ def process(
|
|
| 463 |
cv2.imwrite(canvas_path, canvas)
|
| 464 |
|
| 465 |
hb, wb = grids[0].shape
|
|
|
|
|
|
|
| 466 |
info = {
|
| 467 |
"input": meta,
|
| 468 |
"params": {
|
| 469 |
"sample_frames": int(sample_frames),
|
| 470 |
"patch_size": int(patch_size),
|
| 471 |
-
"
|
| 472 |
"max_pixels": int(max_pixels),
|
| 473 |
"start_sec": float(s_sec),
|
| 474 |
"end_sec": float(e_sec) if e_sec > 0 else None,
|
|
@@ -482,9 +559,19 @@ def process(
|
|
| 482 |
"last_decoded": int(f_end),
|
| 483 |
"actual_frame_ids": [int(x) for x in fids],
|
| 484 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
"resized_frame_size": f"{tw}x{th}",
|
| 486 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 487 |
-
"
|
| 488 |
"total_selected_patches": int(n_selected),
|
| 489 |
"canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
|
| 490 |
"vis_video_fps": round(vis_fps, 2),
|
|
@@ -495,7 +582,11 @@ def process(
|
|
| 495 |
"elapsed_sec": round(time.time() - t0, 2),
|
| 496 |
}
|
| 497 |
progress(0.95, desc="Building charts")
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
progress(1.0, desc="Done")
|
| 501 |
return (
|
|
@@ -563,6 +654,37 @@ CUSTOM_CSS = """
|
|
| 563 |
max-width: 760px;
|
| 564 |
line-height: 1.6;
|
| 565 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
#ovc-hero .pills { display:flex; flex-wrap:wrap; gap:8px; justify-content:center; margin-top:8px; }
|
| 567 |
#ovc-hero .pill {
|
| 568 |
font-size: 0.78rem;
|
|
@@ -579,9 +701,14 @@ CUSTOM_CSS = """
|
|
| 579 |
.ovc-card {
|
| 580 |
border-radius: 16px !important;
|
| 581 |
padding: 14px 16px !important;
|
| 582 |
-
border: 1px solid rgba(148,163,184,0.
|
| 583 |
background: var(--background-fill-primary) !important;
|
| 584 |
box-shadow: 0 1px 3px rgba(15,23,42,0.04);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
}
|
| 586 |
.ovc-card h3 {
|
| 587 |
font-size: 0.78rem !important;
|
|
@@ -668,16 +795,23 @@ THEME = gr.themes.Soft(
|
|
| 668 |
|
| 669 |
HERO_HTML = """
|
| 670 |
<div id="ovc-hero">
|
| 671 |
-
<h1>OneVision
|
| 672 |
<p class="tagline">
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
<div class="pills">
|
| 678 |
<span class="pill">selection Β· heatmap Β· side-by-side</span>
|
| 679 |
<span class="pill">gradient + motion saliency</span>
|
| 680 |
-
<span class="pill">
|
| 681 |
</div>
|
| 682 |
</div>
|
| 683 |
"""
|
|
@@ -686,7 +820,7 @@ try:
|
|
| 686 |
_GR_MAJOR = int(gr.__version__.split(".")[0])
|
| 687 |
except Exception:
|
| 688 |
_GR_MAJOR = 4
|
| 689 |
-
_BLOCK_KW: dict = {"title": "OneVision
|
| 690 |
_LAUNCH_KW: dict = {}
|
| 691 |
if _GR_MAJOR >= 6:
|
| 692 |
# In Gradio 6.0 these moved off Blocks(...) onto launch(...).
|
|
@@ -729,7 +863,12 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 729 |
4, 64, value=16, step=1, label="Sampled frames",
|
| 730 |
)
|
| 731 |
top_k = gr.Slider(
|
| 732 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 733 |
)
|
| 734 |
patch_size = gr.Radio(
|
| 735 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
|
@@ -797,7 +936,15 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 797 |
)
|
| 798 |
|
| 799 |
with gr.Group(elem_classes="ovc-card"):
|
| 800 |
-
gr.Markdown("###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
chart_out = gr.Plot(label="", show_label=False)
|
| 802 |
|
| 803 |
with gr.Row():
|
|
@@ -821,9 +968,9 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 821 |
|
| 822 |
gr.HTML(
|
| 823 |
'<div id="ovc-footer">'
|
| 824 |
-
'
|
| 825 |
-
'
|
| 826 |
-
'
|
| 827 |
'</div>'
|
| 828 |
)
|
| 829 |
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
| 176 |
+
"""Per-frame top-K mask (legacy helper, no longer used by process())."""
|
| 177 |
flat = score.flatten()
|
| 178 |
if k >= flat.size:
|
| 179 |
return np.ones_like(score, dtype=np.uint8)
|
|
|
|
| 183 |
return (score >= thresh).astype(np.uint8)
|
| 184 |
|
| 185 |
|
| 186 |
+
def global_topk_masks(
|
| 187 |
+
grids: List[np.ndarray], total_k: int,
|
| 188 |
+
) -> Tuple[List[np.ndarray], int]:
|
| 189 |
+
"""Pick the top `total_k` highest-scoring patches GLOBALLY across all
|
| 190 |
+
sampled frames, return one mask per frame plus the actual count.
|
| 191 |
+
|
| 192 |
+
Some frames may end up with zero patches (low energy throughout) while
|
| 193 |
+
others may contribute many β that's the whole point: the codec-style
|
| 194 |
+
saliency lets the budget concentrate where it matters."""
|
| 195 |
+
if not grids:
|
| 196 |
+
return [], 0
|
| 197 |
+
arr = np.stack(grids, axis=0).astype(np.float32) # [N, hb, wb]
|
| 198 |
+
N, hb, wb = arr.shape
|
| 199 |
+
flat = arr.reshape(-1)
|
| 200 |
+
if total_k >= flat.size:
|
| 201 |
+
masks = [np.ones((hb, wb), dtype=np.uint8) for _ in range(N)]
|
| 202 |
+
return masks, int(flat.size)
|
| 203 |
+
if total_k <= 0:
|
| 204 |
+
return [np.zeros((hb, wb), dtype=np.uint8) for _ in range(N)], 0
|
| 205 |
+
thresh = np.partition(flat, -total_k)[-total_k]
|
| 206 |
+
bool_mask = (arr >= thresh)
|
| 207 |
+
actual = int(bool_mask.sum())
|
| 208 |
+
return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
|
| 209 |
+
|
| 210 |
+
|
| 211 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 212 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 213 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
|
|
| 345 |
|
| 346 |
|
| 347 |
def make_charts(
|
| 348 |
+
grids: List[np.ndarray],
|
| 349 |
+
masks: List[np.ndarray],
|
| 350 |
+
frame_ids: List[int],
|
| 351 |
+
fps: float,
|
| 352 |
+
total_duration_sec: float,
|
| 353 |
+
total_patches_budget: int,
|
| 354 |
+
saliency_signal: str,
|
| 355 |
):
|
| 356 |
+
"""Two side-by-side panels comparing codec selection vs uniform sampling.
|
| 357 |
+
|
| 358 |
+
Both panels share x = time (seconds), y = number of patches selected at
|
| 359 |
+
that timestamp. Same total patch budget on both sides — only the
|
| 360 |
+
*allocation* differs.
|
| 361 |
+
|
| 362 |
+
Left (codec): patches go where saliency wants them — often spiky
|
| 363 |
+
and concentrated on a few moments.
|
| 364 |
+
Right (uniform): the same budget is spent on full frames sampled
|
| 365 |
+
evenly in time; every uniform frame contributes the
|
| 366 |
+
full grid's worth of patches.
|
| 367 |
+
"""
|
| 368 |
+
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9.2, 3.2), constrained_layout=True)
|
| 369 |
+
|
| 370 |
+
fps_safe = float(fps) if fps and fps > 0 else 25.0
|
| 371 |
+
if grids:
|
| 372 |
+
hb, wb = grids[0].shape
|
| 373 |
+
else:
|
| 374 |
+
hb = wb = 1
|
| 375 |
+
grid_size = hb * wb
|
| 376 |
+
duration = float(total_duration_sec) if total_duration_sec and total_duration_sec > 0 else (
|
| 377 |
+
(max(frame_ids) / fps_safe) if frame_ids else 1.0
|
| 378 |
+
)
|
| 379 |
|
| 380 |
+
# ─── Left: codec selection over time ─────────────────────────────────
|
| 381 |
+
times = [fid / fps_safe for fid in frame_ids]
|
| 382 |
counts = [int(m.sum()) for m in masks]
|
| 383 |
+
if not times:
|
| 384 |
+
times, counts = [0.0], [0]
|
| 385 |
+
bar_w = max(duration / max(1, len(times)) * 0.55, 0.04)
|
| 386 |
+
ax1.bar(
|
| 387 |
+
times, counts, width=bar_w,
|
| 388 |
+
color="#4f46e5", alpha=0.88,
|
| 389 |
+
edgecolor="#312e81", linewidth=0.4,
|
| 390 |
+
)
|
| 391 |
+
total_selected = sum(counts)
|
| 392 |
+
ax1.set_title(
|
| 393 |
+
f"Codec selection · {saliency_signal} · {total_selected} patches",
|
| 394 |
+
fontsize=10, color="#1e293b",
|
| 395 |
+
)
|
| 396 |
+
ax1.set_xlabel("time (s)", fontsize=9)
|
| 397 |
+
ax1.set_ylabel("# patches selected", fontsize=9)
|
| 398 |
+
ax1.set_xlim(-duration * 0.02, duration * 1.02)
|
| 399 |
+
ax1.tick_params(axis="both", labelsize=8)
|
| 400 |
+
ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 401 |
+
ax1.spines[["top", "right"]].set_visible(False)
|
| 402 |
+
|
| 403 |
+
# ─── Right: uniform-sampling baseline at the same budget ────────────
|
| 404 |
+
n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
|
| 405 |
+
uniform_times = (
|
| 406 |
+
[duration * 0.5] if n_uniform == 1
|
| 407 |
+
else list(np.linspace(0.0, duration, n_uniform))
|
| 408 |
+
)
|
| 409 |
+
uniform_counts = [grid_size] * n_uniform
|
| 410 |
+
bar_w_u = max(duration / max(1, n_uniform) * 0.55, 0.04)
|
| 411 |
+
ax2.bar(
|
| 412 |
+
uniform_times, uniform_counts, width=bar_w_u,
|
| 413 |
+
color="#06b6d4", alpha=0.88,
|
| 414 |
+
edgecolor="#0e7490", linewidth=0.4,
|
| 415 |
+
)
|
| 416 |
+
ax2.set_title(
|
| 417 |
+
f"Uniform baseline · {n_uniform} frames × {grid_size} patches "
|
| 418 |
+
f"= {n_uniform * grid_size}",
|
| 419 |
+
fontsize=10, color="#1e293b",
|
| 420 |
+
)
|
| 421 |
+
ax2.set_xlabel("time (s)", fontsize=9)
|
| 422 |
+
ax2.set_ylabel("# patches per uniform frame", fontsize=9)
|
| 423 |
+
ax2.set_xlim(-duration * 0.02, duration * 1.02)
|
| 424 |
+
ax2.tick_params(axis="both", labelsize=8)
|
| 425 |
+
ax2.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 426 |
ax2.spines[["top", "right"]].set_visible(False)
|
| 427 |
|
| 428 |
fig.patch.set_facecolor("white")
|
|
|
|
| 433 |
video_path,
|
| 434 |
sample_frames: int,
|
| 435 |
patch_size: int,
|
| 436 |
+
total_patches: int,
|
| 437 |
max_pixels: int,
|
| 438 |
viz_mode: str = "selection",
|
| 439 |
heatmap_alpha: float = 0.55,
|
|
|
|
| 501 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 502 |
if score_log_scale:
|
| 503 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 504 |
+
masks, actual_selected = global_topk_masks(grids, int(total_patches))
|
| 505 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 506 |
|
| 507 |
mode = (viz_mode or "selection").lower()
|
|
|
|
| 538 |
cv2.imwrite(canvas_path, canvas)
|
| 539 |
|
| 540 |
hb, wb = grids[0].shape
|
| 541 |
+
grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
|
| 542 |
+
n_uniform = max(1, int(int(total_patches) // max(1, grid_size)))
|
| 543 |
info = {
|
| 544 |
"input": meta,
|
| 545 |
"params": {
|
| 546 |
"sample_frames": int(sample_frames),
|
| 547 |
"patch_size": int(patch_size),
|
| 548 |
+
"total_patches_budget": int(total_patches),
|
| 549 |
"max_pixels": int(max_pixels),
|
| 550 |
"start_sec": float(s_sec),
|
| 551 |
"end_sec": float(e_sec) if e_sec > 0 else None,
|
|
|
|
| 559 |
"last_decoded": int(f_end),
|
| 560 |
"actual_frame_ids": [int(x) for x in fids],
|
| 561 |
},
|
| 562 |
+
"uniform_baseline": {
|
| 563 |
+
"frames": int(n_uniform),
|
| 564 |
+
"patches_per_frame": int(grid_size),
|
| 565 |
+
"total_patches": int(n_uniform * grid_size),
|
| 566 |
+
"explanation": (
|
| 567 |
+
"Same patch budget spent on whole frames sampled uniformly in "
|
| 568 |
+
"time — every patch in those frames kept, vs codec's selective "
|
| 569 |
+
"concentration on high-saliency patches."
|
| 570 |
+
),
|
| 571 |
+
},
|
| 572 |
"resized_frame_size": f"{tw}x{th}",
|
| 573 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 574 |
+
"actual_selected_total": int(actual_selected),
|
| 575 |
"total_selected_patches": int(n_selected),
|
| 576 |
"canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
|
| 577 |
"vis_video_fps": round(vis_fps, 2),
|
|
|
|
| 582 |
"elapsed_sec": round(time.time() - t0, 2),
|
| 583 |
}
|
| 584 |
progress(0.95, desc="Building charts")
|
| 585 |
+
duration_sec = (total / fps) if fps > 0 else 0.0
|
| 586 |
+
chart_fig = make_charts(
|
| 587 |
+
grids, masks, fids, fps, duration_sec,
|
| 588 |
+
int(total_patches), saliency_signal,
|
| 589 |
+
)
|
| 590 |
|
| 591 |
progress(1.0, desc="Done")
|
| 592 |
return (
|
|
|
|
| 654 |
max-width: 760px;
|
| 655 |
line-height: 1.6;
|
| 656 |
}
|
| 657 |
+
.ovc-links {
|
| 658 |
+
display: flex; flex-wrap: wrap; gap: 10px;
|
| 659 |
+
justify-content: center; margin: 14px auto 6px;
|
| 660 |
+
position: relative; z-index: 1;
|
| 661 |
+
}
|
| 662 |
+
.ovc-links a {
|
| 663 |
+
text-decoration: none;
|
| 664 |
+
font-weight: 600;
|
| 665 |
+
font-size: 0.9rem;
|
| 666 |
+
padding: 7px 14px;
|
| 667 |
+
border-radius: 999px;
|
| 668 |
+
background: var(--background-fill-primary, #fff);
|
| 669 |
+
border: 1px solid rgba(99,102,241,0.32);
|
| 670 |
+
color: #4338ca;
|
| 671 |
+
transition: transform 0.12s ease, box-shadow 0.18s ease,
|
| 672 |
+
background 0.18s ease, color 0.18s ease, border-color 0.18s ease;
|
| 673 |
+
display: inline-flex; align-items: center;
|
| 674 |
+
box-shadow: 0 1px 2px rgba(15,23,42,0.04);
|
| 675 |
+
}
|
| 676 |
+
.ovc-links a:hover {
|
| 677 |
+
background: var(--ovc-grad);
|
| 678 |
+
color: #fff;
|
| 679 |
+
border-color: transparent;
|
| 680 |
+
transform: translateY(-1px);
|
| 681 |
+
box-shadow: 0 6px 16px rgba(79,70,229,0.32);
|
| 682 |
+
}
|
| 683 |
+
.gradio-container.dark .ovc-links a {
|
| 684 |
+
background: rgba(30,41,59,0.7);
|
| 685 |
+
color: #c7d2fe;
|
| 686 |
+
border-color: rgba(99,102,241,0.4);
|
| 687 |
+
}
|
| 688 |
#ovc-hero .pills { display:flex; flex-wrap:wrap; gap:8px; justify-content:center; margin-top:8px; }
|
| 689 |
#ovc-hero .pill {
|
| 690 |
font-size: 0.78rem;
|
|
|
|
| 701 |
.ovc-card {
|
| 702 |
border-radius: 16px !important;
|
| 703 |
padding: 14px 16px !important;
|
| 704 |
+
border: 1px solid rgba(148,163,184,0.28) !important;
|
| 705 |
background: var(--background-fill-primary) !important;
|
| 706 |
box-shadow: 0 1px 3px rgba(15,23,42,0.04);
|
| 707 |
+
transition: box-shadow 0.18s ease, border-color 0.18s ease;
|
| 708 |
+
}
|
| 709 |
+
.ovc-card:hover {
|
| 710 |
+
border-color: rgba(99,102,241,0.32) !important;
|
| 711 |
+
box-shadow: 0 6px 18px rgba(15,23,42,0.06);
|
| 712 |
}
|
| 713 |
.ovc-card h3 {
|
| 714 |
font-size: 0.78rem !important;
|
|
|
|
| 795 |
|
| 796 |
HERO_HTML = """
|
| 797 |
<div id="ovc-hero">
|
| 798 |
+
<h1>OneVision Encoder</h1>
|
| 799 |
<p class="tagline">
|
| 800 |
+
Codec-style patch saliency for video understanding — see which
|
| 801 |
+
patches the encoder picks from your video and pack them into the
|
| 802 |
+
canvas LLaVA-OneVision consumes.
|
| 803 |
</p>
|
| 804 |
+
<div class="ovc-links">
|
| 805 |
+
<a href="https://www.lmms-lab.com/onevision-encoder/index.html" target="_blank" rel="noopener">π Homepage</a>
|
| 806 |
+
<a href="https://huggingface.co/collections/lmms-lab-encoder/onevision-encoder" target="_blank" rel="noopener">π€ Models</a>
|
| 807 |
+
<a href="https://arxiv.org/abs/2602.08683" target="_blank" rel="noopener">π Tech Report</a>
|
| 808 |
+
<a href="docs/model_card.md" target="_blank" rel="noopener">π Model Card</a>
|
| 809 |
+
<a href="docs/data_card.md" target="_blank" rel="noopener">π Data Card</a>
|
| 810 |
+
</div>
|
| 811 |
<div class="pills">
|
| 812 |
<span class="pill">selection Β· heatmap Β· side-by-side</span>
|
| 813 |
<span class="pill">gradient + motion saliency</span>
|
| 814 |
+
<span class="pill">codec vs uniform baseline</span>
|
| 815 |
</div>
|
| 816 |
</div>
|
| 817 |
"""
|
|
|
|
| 820 |
_GR_MAJOR = int(gr.__version__.split(".")[0])
|
| 821 |
except Exception:
|
| 822 |
_GR_MAJOR = 4
|
| 823 |
+
_BLOCK_KW: dict = {"title": "OneVision Encoder"}
|
| 824 |
_LAUNCH_KW: dict = {}
|
| 825 |
if _GR_MAJOR >= 6:
|
| 826 |
# In Gradio 6.0 these moved off Blocks(...) onto launch(...).
|
|
|
|
| 863 |
4, 64, value=16, step=1, label="Sampled frames",
|
| 864 |
)
|
| 865 |
top_k = gr.Slider(
|
| 866 |
+
64, 8192, value=1024, step=32,
|
| 867 |
+
label="Total patches budget (whole video)",
|
| 868 |
+
info="The single budget shared across the whole video. "
|
| 869 |
+
"The codec saliency picks these patches GLOBALLY — "
|
| 870 |
+
"high-energy frames may contribute many, low-energy "
|
| 871 |
+
"frames may contribute zero.",
|
| 872 |
)
|
| 873 |
patch_size = gr.Radio(
|
| 874 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
|
|
|
| 936 |
)
|
| 937 |
|
| 938 |
with gr.Group(elem_classes="ovc-card"):
|
| 939 |
+
gr.Markdown("### Codec selection vs uniform baseline")
|
| 940 |
+
gr.Markdown(
|
| 941 |
+
"<small>Same total patch budget on both sides. "
|
| 942 |
+
"<b>Left</b>: where the codec saliency actually placed "
|
| 943 |
+
"patches (often spiky, concentrated on important moments). "
|
| 944 |
+
"<b>Right</b>: how a naive uniform frame sampler would "
|
| 945 |
+
"spend the same budget — spread evenly in time, every "
|
| 946 |
+
"patch in the chosen frames kept.</small>"
|
| 947 |
+
)
|
| 948 |
chart_out = gr.Plot(label="", show_label=False)
|
| 949 |
|
| 950 |
with gr.Row():
|
|
|
|
| 968 |
|
| 969 |
gr.HTML(
|
| 970 |
'<div id="ovc-footer">'
|
| 971 |
+
'<b>OneVision Encoder</b> · codec-style patch saliency demo · '
|
| 972 |
+
'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
|
| 973 |
+
'global top-K selection across all sampled frames.'
|
| 974 |
'</div>'
|
| 975 |
)
|
| 976 |
|