Rename to OneVision Encoder; global top-K; codec-vs-uniform charts
UI / brand
- Rename Space and headings to "OneVision Encoder".
- Hero now carries Homepage / Models / Tech Report / Model Card /
Data Card links as pill-style buttons (white -> gradient on hover).
- Card hover lift; tighter typography rhythm.
Algorithm
- Patch budget is now a single global Top-K spent across the whole
video, not per-frame. High-saliency frames may receive many
patches; low-saliency frames may receive zero.
- Slider renamed to "Total patches budget (whole video)" with
explanatory `info` text; range bumped to 64..8192, default 1024.
Charts (gr.Plot)
- Replaced the score histogram with a codec-vs-uniform comparison:
* Left : x = time (s), y = patches selected at that timestamp by
the codec saliency, including zero bars where the budget
skipped the frame.
* Right : x = time (s), y = full grid size, drawn for the
N_uniform = budget // grid frames a uniform sampler
would extract under the same total budget.
- Both panels share scale so the contrast in allocation is obvious.
Run info
- `top_k_per_frame` -> `total_patches_budget`.
- Adds `actual_selected_total` and a `uniform_baseline` block with
frames / patches_per_frame / total_patches / explanation, so the
chart numbers are reproducible from the JSON alone.
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: OneVision
|
| 3 |
emoji: π
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
|
@@ -9,7 +9,7 @@ python_version: '3.13'
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: apache-2.0
|
| 12 |
-
short_description:
|
| 13 |
---
|
| 14 |
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OneVision Encoder
|
| 3 |
emoji: π
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
|
|
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
license: apache-2.0
|
| 12 |
+
short_description: Codec-style patch saliency for video understanding
|
| 13 |
---
|
| 14 |
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
@@ -173,6 +173,7 @@ def compute_score_grids(
|
|
| 173 |
|
| 174 |
|
| 175 |
def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
|
|
|
| 176 |
flat = score.flatten()
|
| 177 |
if k >= flat.size:
|
| 178 |
return np.ones_like(score, dtype=np.uint8)
|
|
@@ -182,6 +183,31 @@ def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
|
| 182 |
return (score >= thresh).astype(np.uint8)
|
| 183 |
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 186 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 187 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
@@ -319,35 +345,84 @@ def pack_canvas(
|
|
| 319 |
|
| 320 |
|
| 321 |
def make_charts(
|
| 322 |
-
grids: List[np.ndarray],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
):
|
| 324 |
-
"""Two side-by-side panels
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
ax1.
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
|
|
|
|
|
|
| 340 |
counts = [int(m.sum()) for m in masks]
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
ax2.spines[["top", "right"]].set_visible(False)
|
| 352 |
|
| 353 |
fig.patch.set_facecolor("white")
|
|
@@ -358,7 +433,7 @@ def process(
|
|
| 358 |
video_path,
|
| 359 |
sample_frames: int,
|
| 360 |
patch_size: int,
|
| 361 |
-
|
| 362 |
max_pixels: int,
|
| 363 |
viz_mode: str = "selection",
|
| 364 |
heatmap_alpha: float = 0.55,
|
|
@@ -426,7 +501,7 @@ def process(
|
|
| 426 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 427 |
if score_log_scale:
|
| 428 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 429 |
-
masks =
|
| 430 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 431 |
|
| 432 |
mode = (viz_mode or "selection").lower()
|
|
@@ -463,12 +538,14 @@ def process(
|
|
| 463 |
cv2.imwrite(canvas_path, canvas)
|
| 464 |
|
| 465 |
hb, wb = grids[0].shape
|
|
|
|
|
|
|
| 466 |
info = {
|
| 467 |
"input": meta,
|
| 468 |
"params": {
|
| 469 |
"sample_frames": int(sample_frames),
|
| 470 |
"patch_size": int(patch_size),
|
| 471 |
-
"
|
| 472 |
"max_pixels": int(max_pixels),
|
| 473 |
"start_sec": float(s_sec),
|
| 474 |
"end_sec": float(e_sec) if e_sec > 0 else None,
|
|
@@ -482,9 +559,19 @@ def process(
|
|
| 482 |
"last_decoded": int(f_end),
|
| 483 |
"actual_frame_ids": [int(x) for x in fids],
|
| 484 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
"resized_frame_size": f"{tw}x{th}",
|
| 486 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 487 |
-
"
|
| 488 |
"total_selected_patches": int(n_selected),
|
| 489 |
"canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
|
| 490 |
"vis_video_fps": round(vis_fps, 2),
|
|
@@ -495,7 +582,11 @@ def process(
|
|
| 495 |
"elapsed_sec": round(time.time() - t0, 2),
|
| 496 |
}
|
| 497 |
progress(0.95, desc="Building charts")
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
progress(1.0, desc="Done")
|
| 501 |
return (
|
|
@@ -563,6 +654,37 @@ CUSTOM_CSS = """
|
|
| 563 |
max-width: 760px;
|
| 564 |
line-height: 1.6;
|
| 565 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
#ovc-hero .pills { display:flex; flex-wrap:wrap; gap:8px; justify-content:center; margin-top:8px; }
|
| 567 |
#ovc-hero .pill {
|
| 568 |
font-size: 0.78rem;
|
|
@@ -579,9 +701,14 @@ CUSTOM_CSS = """
|
|
| 579 |
.ovc-card {
|
| 580 |
border-radius: 16px !important;
|
| 581 |
padding: 14px 16px !important;
|
| 582 |
-
border: 1px solid rgba(148,163,184,0.
|
| 583 |
background: var(--background-fill-primary) !important;
|
| 584 |
box-shadow: 0 1px 3px rgba(15,23,42,0.04);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
}
|
| 586 |
.ovc-card h3 {
|
| 587 |
font-size: 0.78rem !important;
|
|
@@ -668,16 +795,23 @@ THEME = gr.themes.Soft(
|
|
| 668 |
|
| 669 |
HERO_HTML = """
|
| 670 |
<div id="ovc-hero">
|
| 671 |
-
<h1>OneVision
|
| 672 |
<p class="tagline">
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
<div class="pills">
|
| 678 |
<span class="pill">selection Β· heatmap Β· side-by-side</span>
|
| 679 |
<span class="pill">gradient + motion saliency</span>
|
| 680 |
-
<span class="pill">
|
| 681 |
</div>
|
| 682 |
</div>
|
| 683 |
"""
|
|
@@ -686,7 +820,7 @@ try:
|
|
| 686 |
_GR_MAJOR = int(gr.__version__.split(".")[0])
|
| 687 |
except Exception:
|
| 688 |
_GR_MAJOR = 4
|
| 689 |
-
_BLOCK_KW: dict = {"title": "OneVision
|
| 690 |
_LAUNCH_KW: dict = {}
|
| 691 |
if _GR_MAJOR >= 6:
|
| 692 |
# In Gradio 6.0 these moved off Blocks(...) onto launch(...).
|
|
@@ -729,7 +863,12 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 729 |
4, 64, value=16, step=1, label="Sampled frames",
|
| 730 |
)
|
| 731 |
top_k = gr.Slider(
|
| 732 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 733 |
)
|
| 734 |
patch_size = gr.Radio(
|
| 735 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
|
@@ -797,7 +936,15 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 797 |
)
|
| 798 |
|
| 799 |
with gr.Group(elem_classes="ovc-card"):
|
| 800 |
-
gr.Markdown("###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
chart_out = gr.Plot(label="", show_label=False)
|
| 802 |
|
| 803 |
with gr.Row():
|
|
@@ -821,9 +968,9 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 821 |
|
| 822 |
gr.HTML(
|
| 823 |
'<div id="ovc-footer">'
|
| 824 |
-
'
|
| 825 |
-
'
|
| 826 |
-
'
|
| 827 |
'</div>'
|
| 828 |
)
|
| 829 |
|
|
|
|
| 173 |
|
| 174 |
|
| 175 |
def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
|
| 176 |
+
"""Per-frame top-K mask (legacy helper, no longer used by process())."""
|
| 177 |
flat = score.flatten()
|
| 178 |
if k >= flat.size:
|
| 179 |
return np.ones_like(score, dtype=np.uint8)
|
|
|
|
| 183 |
return (score >= thresh).astype(np.uint8)
|
| 184 |
|
| 185 |
|
| 186 |
+
def global_topk_masks(
|
| 187 |
+
grids: List[np.ndarray], total_k: int,
|
| 188 |
+
) -> Tuple[List[np.ndarray], int]:
|
| 189 |
+
"""Pick the top `total_k` highest-scoring patches GLOBALLY across all
|
| 190 |
+
sampled frames, return one mask per frame plus the actual count.
|
| 191 |
+
|
| 192 |
+
Some frames may end up with zero patches (low energy throughout) while
|
| 193 |
+
others may contribute many β that's the whole point: the codec-style
|
| 194 |
+
saliency lets the budget concentrate where it matters."""
|
| 195 |
+
if not grids:
|
| 196 |
+
return [], 0
|
| 197 |
+
arr = np.stack(grids, axis=0).astype(np.float32) # [N, hb, wb]
|
| 198 |
+
N, hb, wb = arr.shape
|
| 199 |
+
flat = arr.reshape(-1)
|
| 200 |
+
if total_k >= flat.size:
|
| 201 |
+
masks = [np.ones((hb, wb), dtype=np.uint8) for _ in range(N)]
|
| 202 |
+
return masks, int(flat.size)
|
| 203 |
+
if total_k <= 0:
|
| 204 |
+
return [np.zeros((hb, wb), dtype=np.uint8) for _ in range(N)], 0
|
| 205 |
+
thresh = np.partition(flat, -total_k)[-total_k]
|
| 206 |
+
bool_mask = (arr >= thresh)
|
| 207 |
+
actual = int(bool_mask.sum())
|
| 208 |
+
return [bool_mask[i].astype(np.uint8) for i in range(N)], actual
|
| 209 |
+
|
| 210 |
+
|
| 211 |
def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
|
| 212 |
"""Convert to gray-white wash: gray * (1-fade) + white * fade."""
|
| 213 |
gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
|
|
|
|
| 345 |
|
| 346 |
|
| 347 |
def make_charts(
|
| 348 |
+
grids: List[np.ndarray],
|
| 349 |
+
masks: List[np.ndarray],
|
| 350 |
+
frame_ids: List[int],
|
| 351 |
+
fps: float,
|
| 352 |
+
total_duration_sec: float,
|
| 353 |
+
total_patches_budget: int,
|
| 354 |
+
saliency_signal: str,
|
| 355 |
):
|
| 356 |
+
"""Two side-by-side panels comparing codec selection vs uniform sampling.
|
| 357 |
+
|
| 358 |
+
Both panels share x = time (seconds), y = number of patches selected at
|
| 359 |
+
that timestamp. Same total patch budget on both sides — only the
|
| 360 |
+
*allocation* differs.
|
| 361 |
+
|
| 362 |
+
Left (codec): patches go where saliency wants them — often spiky
|
| 363 |
+
and concentrated on a few moments.
|
| 364 |
+
Right (uniform): the same budget is spent on full frames sampled
|
| 365 |
+
evenly in time; every uniform frame contributes the
|
| 366 |
+
full grid's worth of patches.
|
| 367 |
+
"""
|
| 368 |
+
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9.2, 3.2), constrained_layout=True)
|
| 369 |
+
|
| 370 |
+
fps_safe = float(fps) if fps and fps > 0 else 25.0
|
| 371 |
+
if grids:
|
| 372 |
+
hb, wb = grids[0].shape
|
| 373 |
+
else:
|
| 374 |
+
hb = wb = 1
|
| 375 |
+
grid_size = hb * wb
|
| 376 |
+
duration = float(total_duration_sec) if total_duration_sec and total_duration_sec > 0 else (
|
| 377 |
+
(max(frame_ids) / fps_safe) if frame_ids else 1.0
|
| 378 |
+
)
|
| 379 |
|
| 380 |
+
# ─── Left: codec selection over time ─────────────────────────────────
|
| 381 |
+
times = [fid / fps_safe for fid in frame_ids]
|
| 382 |
counts = [int(m.sum()) for m in masks]
|
| 383 |
+
if not times:
|
| 384 |
+
times, counts = [0.0], [0]
|
| 385 |
+
bar_w = max(duration / max(1, len(times)) * 0.55, 0.04)
|
| 386 |
+
ax1.bar(
|
| 387 |
+
times, counts, width=bar_w,
|
| 388 |
+
color="#4f46e5", alpha=0.88,
|
| 389 |
+
edgecolor="#312e81", linewidth=0.4,
|
| 390 |
+
)
|
| 391 |
+
total_selected = sum(counts)
|
| 392 |
+
ax1.set_title(
|
| 393 |
+
f"Codec selection · {saliency_signal} · {total_selected} patches",
|
| 394 |
+
fontsize=10, color="#1e293b",
|
| 395 |
+
)
|
| 396 |
+
ax1.set_xlabel("time (s)", fontsize=9)
|
| 397 |
+
ax1.set_ylabel("# patches selected", fontsize=9)
|
| 398 |
+
ax1.set_xlim(-duration * 0.02, duration * 1.02)
|
| 399 |
+
ax1.tick_params(axis="both", labelsize=8)
|
| 400 |
+
ax1.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 401 |
+
ax1.spines[["top", "right"]].set_visible(False)
|
| 402 |
+
|
| 403 |
+
# ─── Right: uniform-sampling baseline at the same budget ────────────
|
| 404 |
+
n_uniform = max(1, int(total_patches_budget // max(1, grid_size)))
|
| 405 |
+
uniform_times = (
|
| 406 |
+
[duration * 0.5] if n_uniform == 1
|
| 407 |
+
else list(np.linspace(0.0, duration, n_uniform))
|
| 408 |
+
)
|
| 409 |
+
uniform_counts = [grid_size] * n_uniform
|
| 410 |
+
bar_w_u = max(duration / max(1, n_uniform) * 0.55, 0.04)
|
| 411 |
+
ax2.bar(
|
| 412 |
+
uniform_times, uniform_counts, width=bar_w_u,
|
| 413 |
+
color="#06b6d4", alpha=0.88,
|
| 414 |
+
edgecolor="#0e7490", linewidth=0.4,
|
| 415 |
+
)
|
| 416 |
+
ax2.set_title(
|
| 417 |
+
f"Uniform baseline · {n_uniform} frames × {grid_size} patches "
|
| 418 |
+
f"= {n_uniform * grid_size}",
|
| 419 |
+
fontsize=10, color="#1e293b",
|
| 420 |
+
)
|
| 421 |
+
ax2.set_xlabel("time (s)", fontsize=9)
|
| 422 |
+
ax2.set_ylabel("# patches per uniform frame", fontsize=9)
|
| 423 |
+
ax2.set_xlim(-duration * 0.02, duration * 1.02)
|
| 424 |
+
ax2.tick_params(axis="both", labelsize=8)
|
| 425 |
+
ax2.grid(True, alpha=0.25, linestyle="--", axis="y")
|
| 426 |
ax2.spines[["top", "right"]].set_visible(False)
|
| 427 |
|
| 428 |
fig.patch.set_facecolor("white")
|
|
|
|
| 433 |
video_path,
|
| 434 |
sample_frames: int,
|
| 435 |
patch_size: int,
|
| 436 |
+
total_patches: int,
|
| 437 |
max_pixels: int,
|
| 438 |
viz_mode: str = "selection",
|
| 439 |
heatmap_alpha: float = 0.55,
|
|
|
|
| 501 |
grids = compute_score_grids(resized, int(patch_size), saliency_signal)
|
| 502 |
if score_log_scale:
|
| 503 |
grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
|
| 504 |
+
masks, actual_selected = global_topk_masks(grids, int(total_patches))
|
| 505 |
norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
|
| 506 |
|
| 507 |
mode = (viz_mode or "selection").lower()
|
|
|
|
| 538 |
cv2.imwrite(canvas_path, canvas)
|
| 539 |
|
| 540 |
hb, wb = grids[0].shape
|
| 541 |
+
grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
|
| 542 |
+
n_uniform = max(1, int(int(total_patches) // max(1, grid_size)))
|
| 543 |
info = {
|
| 544 |
"input": meta,
|
| 545 |
"params": {
|
| 546 |
"sample_frames": int(sample_frames),
|
| 547 |
"patch_size": int(patch_size),
|
| 548 |
+
"total_patches_budget": int(total_patches),
|
| 549 |
"max_pixels": int(max_pixels),
|
| 550 |
"start_sec": float(s_sec),
|
| 551 |
"end_sec": float(e_sec) if e_sec > 0 else None,
|
|
|
|
| 559 |
"last_decoded": int(f_end),
|
| 560 |
"actual_frame_ids": [int(x) for x in fids],
|
| 561 |
},
|
| 562 |
+
"uniform_baseline": {
|
| 563 |
+
"frames": int(n_uniform),
|
| 564 |
+
"patches_per_frame": int(grid_size),
|
| 565 |
+
"total_patches": int(n_uniform * grid_size),
|
| 566 |
+
"explanation": (
|
| 567 |
+
"Same patch budget spent on whole frames sampled uniformly in "
|
| 568 |
+
"time — every patch in those frames kept, vs codec's selective "
|
| 569 |
+
"concentration on high-saliency patches."
|
| 570 |
+
),
|
| 571 |
+
},
|
| 572 |
"resized_frame_size": f"{tw}x{th}",
|
| 573 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 574 |
+
"actual_selected_total": int(actual_selected),
|
| 575 |
"total_selected_patches": int(n_selected),
|
| 576 |
"canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
|
| 577 |
"vis_video_fps": round(vis_fps, 2),
|
|
|
|
| 582 |
"elapsed_sec": round(time.time() - t0, 2),
|
| 583 |
}
|
| 584 |
progress(0.95, desc="Building charts")
|
| 585 |
+
duration_sec = (total / fps) if fps > 0 else 0.0
|
| 586 |
+
chart_fig = make_charts(
|
| 587 |
+
grids, masks, fids, fps, duration_sec,
|
| 588 |
+
int(total_patches), saliency_signal,
|
| 589 |
+
)
|
| 590 |
|
| 591 |
progress(1.0, desc="Done")
|
| 592 |
return (
|
|
|
|
| 654 |
max-width: 760px;
|
| 655 |
line-height: 1.6;
|
| 656 |
}
|
| 657 |
+
.ovc-links {
|
| 658 |
+
display: flex; flex-wrap: wrap; gap: 10px;
|
| 659 |
+
justify-content: center; margin: 14px auto 6px;
|
| 660 |
+
position: relative; z-index: 1;
|
| 661 |
+
}
|
| 662 |
+
.ovc-links a {
|
| 663 |
+
text-decoration: none;
|
| 664 |
+
font-weight: 600;
|
| 665 |
+
font-size: 0.9rem;
|
| 666 |
+
padding: 7px 14px;
|
| 667 |
+
border-radius: 999px;
|
| 668 |
+
background: var(--background-fill-primary, #fff);
|
| 669 |
+
border: 1px solid rgba(99,102,241,0.32);
|
| 670 |
+
color: #4338ca;
|
| 671 |
+
transition: transform 0.12s ease, box-shadow 0.18s ease,
|
| 672 |
+
background 0.18s ease, color 0.18s ease, border-color 0.18s ease;
|
| 673 |
+
display: inline-flex; align-items: center;
|
| 674 |
+
box-shadow: 0 1px 2px rgba(15,23,42,0.04);
|
| 675 |
+
}
|
| 676 |
+
.ovc-links a:hover {
|
| 677 |
+
background: var(--ovc-grad);
|
| 678 |
+
color: #fff;
|
| 679 |
+
border-color: transparent;
|
| 680 |
+
transform: translateY(-1px);
|
| 681 |
+
box-shadow: 0 6px 16px rgba(79,70,229,0.32);
|
| 682 |
+
}
|
| 683 |
+
.gradio-container.dark .ovc-links a {
|
| 684 |
+
background: rgba(30,41,59,0.7);
|
| 685 |
+
color: #c7d2fe;
|
| 686 |
+
border-color: rgba(99,102,241,0.4);
|
| 687 |
+
}
|
| 688 |
#ovc-hero .pills { display:flex; flex-wrap:wrap; gap:8px; justify-content:center; margin-top:8px; }
|
| 689 |
#ovc-hero .pill {
|
| 690 |
font-size: 0.78rem;
|
|
|
|
| 701 |
.ovc-card {
|
| 702 |
border-radius: 16px !important;
|
| 703 |
padding: 14px 16px !important;
|
| 704 |
+
border: 1px solid rgba(148,163,184,0.28) !important;
|
| 705 |
background: var(--background-fill-primary) !important;
|
| 706 |
box-shadow: 0 1px 3px rgba(15,23,42,0.04);
|
| 707 |
+
transition: box-shadow 0.18s ease, border-color 0.18s ease;
|
| 708 |
+
}
|
| 709 |
+
.ovc-card:hover {
|
| 710 |
+
border-color: rgba(99,102,241,0.32) !important;
|
| 711 |
+
box-shadow: 0 6px 18px rgba(15,23,42,0.06);
|
| 712 |
}
|
| 713 |
.ovc-card h3 {
|
| 714 |
font-size: 0.78rem !important;
|
|
|
|
| 795 |
|
| 796 |
HERO_HTML = """
|
| 797 |
<div id="ovc-hero">
|
| 798 |
+
<h1>OneVision Encoder</h1>
|
| 799 |
<p class="tagline">
|
| 800 |
+
Codec-style patch saliency for video understanding — see which
|
| 801 |
+
patches the encoder picks from your video and pack them into the
|
| 802 |
+
canvas LLaVA-OneVision consumes.
|
| 803 |
</p>
|
| 804 |
+
<div class="ovc-links">
|
| 805 |
+
<a href="https://www.lmms-lab.com/onevision-encoder/index.html" target="_blank" rel="noopener">π Homepage</a>
|
| 806 |
+
<a href="https://huggingface.co/collections/lmms-lab-encoder/onevision-encoder" target="_blank" rel="noopener">π€ Models</a>
|
| 807 |
+
<a href="https://arxiv.org/abs/2602.08683" target="_blank" rel="noopener">π Tech Report</a>
|
| 808 |
+
<a href="docs/model_card.md" target="_blank" rel="noopener">π Model Card</a>
|
| 809 |
+
<a href="docs/data_card.md" target="_blank" rel="noopener">π Data Card</a>
|
| 810 |
+
</div>
|
| 811 |
<div class="pills">
|
| 812 |
<span class="pill">selection Β· heatmap Β· side-by-side</span>
|
| 813 |
<span class="pill">gradient + motion saliency</span>
|
| 814 |
+
<span class="pill">codec vs uniform baseline</span>
|
| 815 |
</div>
|
| 816 |
</div>
|
| 817 |
"""
|
|
|
|
| 820 |
_GR_MAJOR = int(gr.__version__.split(".")[0])
|
| 821 |
except Exception:
|
| 822 |
_GR_MAJOR = 4
|
| 823 |
+
_BLOCK_KW: dict = {"title": "OneVision Encoder"}
|
| 824 |
_LAUNCH_KW: dict = {}
|
| 825 |
if _GR_MAJOR >= 6:
|
| 826 |
# In Gradio 6.0 these moved off Blocks(...) onto launch(...).
|
|
|
|
| 863 |
4, 64, value=16, step=1, label="Sampled frames",
|
| 864 |
)
|
| 865 |
top_k = gr.Slider(
|
| 866 |
+
64, 8192, value=1024, step=32,
|
| 867 |
+
label="Total patches budget (whole video)",
|
| 868 |
+
info="The single budget shared across the whole video. "
|
| 869 |
+
"The codec saliency picks these patches GLOBALLY — "
|
| 870 |
+
"high-energy frames may contribute many, low-energy "
|
| 871 |
+
"frames may contribute zero.",
|
| 872 |
)
|
| 873 |
patch_size = gr.Radio(
|
| 874 |
PATCH_CHOICES, value=14, label="Patch size (px)",
|
|
|
|
| 936 |
)
|
| 937 |
|
| 938 |
with gr.Group(elem_classes="ovc-card"):
|
| 939 |
+
gr.Markdown("### Codec selection vs uniform baseline")
|
| 940 |
+
gr.Markdown(
|
| 941 |
+
"<small>Same total patch budget on both sides. "
|
| 942 |
+
"<b>Left</b>: where the codec saliency actually placed "
|
| 943 |
+
"patches (often spiky, concentrated on important moments). "
|
| 944 |
+
"<b>Right</b>: how a naive uniform frame sampler would "
|
| 945 |
+
"spend the same budget — spread evenly in time, every "
|
| 946 |
+
"patch in the chosen frames kept.</small>"
|
| 947 |
+
)
|
| 948 |
chart_out = gr.Plot(label="", show_label=False)
|
| 949 |
|
| 950 |
with gr.Row():
|
|
|
|
| 968 |
|
| 969 |
gr.HTML(
|
| 970 |
'<div id="ovc-footer">'
|
| 971 |
+
'<b>OneVision Encoder</b> · codec-style patch saliency demo · '
|
| 972 |
+
'Sobel + frame-diff stand in for the ffmpeg bitcost patch · '
|
| 973 |
+
'global top-K selection across all sampled frames.'
|
| 974 |
'</div>'
|
| 975 |
)
|
| 976 |
|