Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

FeilongTang committed 1 day ago

Commit

36a9b0a

1 Parent(s): 14590e3

Codec patch selection demo: visualization + canvas

- Replace probe with the codec_tools-style pipeline:
uniform sample -> smart_resize -> per-patch saliency ->
top-K selection -> visualization video + packed canvas.
- Three viz modes: selection (kept-in-color, dropped fade-to-gray),
heatmap (full-frame JET overlay), sbs (side-by-side).
- Saliency: gradient (Sobel), frame_diff (motion), or combined.
- Tunables: time window (start/end sec), top-K, patch size,
max_pixels, log1p scoring, percentile normalization, fade
strength, heatmap blend alpha.
- Designer pass on UI: indigo Soft theme, hero gradient title,
card-grouped controls, prominent Run button, output-priority
layout, footer credit.

Files changed (3) hide show

.gitignore +20 -0
app.py +655 -61
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,20 @@

+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+# Virtualenvs (local dev only)
+.venv/
+venv/
+env/
+# OS
+.DS_Store
+# Editors
+.vscode/
+.idea/
+# Local outputs
+codec_view_outputs/
+*.log

app.py CHANGED Viewed

@@ -1,83 +1,677 @@
 import json
 import os
 import shutil
 import subprocess
 import gradio as gr
-def probe_video(video_path):
     if not video_path:
-        return "Please upload a video.", None
-    if shutil.which("ffprobe") is None:
-        return (
-            "ffprobe not found. Add `ffmpeg` to packages.txt at the repo root.",
-            None,
         )
-    try:
-        result = subprocess.run(
-            [
-                "ffprobe",
-                "-v", "quiet",
-                "-print_format", "json",
-                "-show_format",
-                "-show_streams",
-                video_path,
-            ],
-            capture_output=True,
-            text=True,
-            check=True,
         )
-    except subprocess.CalledProcessError as e:
-        return f"ffprobe failed:\n{e.stderr}", None
-    info = json.loads(result.stdout)
-    fmt = info.get("format", {}) or {}
-    streams = info.get("streams", []) or []
-    v = next((s for s in streams if s.get("codec_type") == "video"), {})
-    a = next((s for s in streams if s.get("codec_type") == "audio"), {})
-    summary = {
-        "filename": os.path.basename(fmt.get("filename", "")),
-        "format": fmt.get("format_long_name") or fmt.get("format_name"),
-        "duration_sec": fmt.get("duration"),
-        "size_bytes": fmt.get("size"),
-        "overall_bitrate_bps": fmt.get("bit_rate"),
-        "video": {
-            "codec": v.get("codec_name"),
-            "profile": v.get("profile"),
-            "width": v.get("width"),
-            "height": v.get("height"),
-            "pix_fmt": v.get("pix_fmt"),
-            "frame_rate": v.get("r_frame_rate"),
-            "bitrate_bps": v.get("bit_rate"),
         },
-        "audio": {
-            "codec": a.get("codec_name"),
-            "sample_rate": a.get("sample_rate"),
-            "channels": a.get("channels"),
-            "bitrate_bps": a.get("bit_rate"),
         },
     }
-    return json.dumps(summary, indent=2, ensure_ascii=False), video_path
-with gr.Blocks(title="OneVision Encoder Codec View") as demo:
-    gr.Markdown(
-        "# OneVision Encoder Codec View\n"
-        "Upload a video to inspect its container / codec metadata via `ffprobe`."
     )
-    with gr.Row():
-        with gr.Column():
-            video_in = gr.Video(label="Input video", sources=["upload"])
-            run_btn = gr.Button("Probe", variant="primary")
-        with gr.Column():
-            video_out = gr.Video(label="Preview")
-            info_out = gr.Code(label="Metadata (JSON)", language="json")
-    run_btn.click(probe_video, inputs=video_in, outputs=[info_out, video_out])
 if __name__ == "__main__":
-    demo.launch()

+"""OneVision Encoder Codec View.
+A simplified, dependency-light port of the codec_tools pipeline from
+lmms-eval-ov2. The original tool relies on a bitcost-patched ffmpeg 5.1 to
+score every macroblock by its actual encoding bit cost; we approximate that
+saliency signal with a Sobel gradient magnitude per patch (high gradient =
+high local complexity = roughly what the encoder would spend bits on).
+Pipeline (mirrors codec_tools/pipeline/process_video_bitcost_readiness.py):
+    1. Uniformly sample N frames from the input video.
+    2. smart_resize each frame so dims are multiples of `patch` and the
+       total pixel count <= max_pixels.
+    3. Slice every frame into a patch grid; score each patch by its
+       Sobel gradient magnitude mean.
+    4. Pick the top-K highest-scoring patches per frame.
+    5. Render a "selection visualization" video: kept patches stay in
+       full color, dropped patches are faded to a gray-white wash so the
+       viewer can see exactly which patches the codec stage chose.
+    6. Pack the selected patches in time-order, raster scan, into a
+       single canvas image (the artifact LLaVA-OneVision2 consumes).
+"""
 import json
+import math
 import os
 import shutil
 import subprocess
+import tempfile
+import time
+from typing import List, Tuple
+import cv2
 import gradio as gr
+import imageio_ffmpeg
+import numpy as np
+# Patch edge lengths (in pixels) offered by the "Patch size (px)" radio control.
+PATCH_CHOICES = [14, 16, 28]
+def smart_resize(frame: np.ndarray, max_pixels: int, factor: int) -> np.ndarray:
+    """Shrink `frame` to fit `max_pixels` and snap both sides to `factor`.
+    Frames whose area exceeds `max_pixels` are scaled down by
+    sqrt(max_pixels / area). Each side is then rounded down to a multiple of
+    `factor`, but never below `factor` itself.
+    """
+    src_h, src_w = frame.shape[:2]
+    area = src_h * src_w
+    # Only ever shrink: frames already within the pixel budget keep their scale.
+    shrink = math.sqrt(max_pixels / area) if area > max_pixels else None
+    def _snap(side: int) -> int:
+        if shrink is not None:
+            side = max(factor, int(side * shrink))
+        return max(factor, side - side % factor)
+    dst_h, dst_w = _snap(src_h), _snap(src_w)
+    # cv2.resize takes the target size as (width, height).
+    return cv2.resize(frame, (dst_w, dst_h), interpolation=cv2.INTER_AREA)
+def sample_frame_ids(total: int, n: int) -> List[int]:
+    """Return up to `n` frame indices spread uniformly over [0, total - 1].
+    Args:
+        total: Number of frames available.
+        n: Number of frames wanted.
+    Returns:
+        An empty list when `total` or `n` is not positive, every index when
+        `n >= total`, otherwise `n` rounded, evenly spaced indices that
+        include both the first and the last frame.
+    """
+    # np.linspace rejects a negative sample count, so handle n <= 0 up front.
+    if total <= 0 or n <= 0:
+        return []
+    if n >= total:
+        return list(range(total))
+    return [int(round(i)) for i in np.linspace(0, total - 1, n)]
+def decode_frames(video_path: str, frame_ids: List[int]) -> List[np.ndarray]:
+    """Seek to each index in `frame_ids` and return the frames that decode.
+    Args:
+        video_path: Path handed to cv2.VideoCapture.
+        frame_ids: Frame indices to seek to, in the order they should be read.
+    Returns:
+        The decoded frames in request order. Frames that fail to read are
+        skipped, so the result can be shorter than `frame_ids`; it is empty
+        when the video cannot be opened.
+    """
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            return []
+        frames: List[np.ndarray] = []
+        for fid in frame_ids:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fid))
+            ok, fr = cap.read()
+            if ok:
+                frames.append(fr)
+        return frames
+    finally:
+        # Release the capture on every exit path, including exceptions.
+        cap.release()
+def video_metadata(video_path: str) -> dict:
+    """Collect basic stream properties via OpenCV, plus codec info via ffprobe.
+    Returns:
+        A dict with `total_frames`, `fps` (rounded to 3 decimals), `width`
+        and `height` as reported by cv2.VideoCapture. When an `ffprobe`
+        executable is on PATH, `codec`, `pix_fmt`, `profile` and
+        `bitrate_bps` of the first video stream are added; if that probe
+        fails, the error text is stored under `ffprobe_error` instead.
+    """
+    cap = cv2.VideoCapture(video_path)
+    try:
+        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = float(cap.get(cv2.CAP_PROP_FPS) or 0.0)
+        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    finally:
+        cap.release()
+    meta = {
+        "total_frames": total,
+        "fps": round(fps, 3),
+        "width": w,
+        "height": h,
+    }
+    if shutil.which("ffprobe"):
+        try:
+            r = subprocess.run(
+                [
+                    "ffprobe", "-v", "quiet", "-select_streams", "v:0",
+                    "-show_entries", "stream=codec_name,bit_rate,pix_fmt,profile",
+                    "-of", "json", video_path,
+                ],
+                capture_output=True, text=True, check=True, timeout=15,
+            )
+            # ffprobe can report an empty "streams" list; treat that like a
+            # stream with no fields rather than indexing into nothing.
+            streams = json.loads(r.stdout).get("streams") or [{}]
+            data = streams[0]
+            meta["codec"] = data.get("codec_name")
+            meta["pix_fmt"] = data.get("pix_fmt")
+            meta["profile"] = data.get("profile")
+            meta["bitrate_bps"] = data.get("bit_rate")
+        except Exception as e:
+            # The probe is best-effort: record the failure, keep the OpenCV data.
+            meta["ffprobe_error"] = str(e)
+    return meta
+def patch_score_grid(frame_bgr: np.ndarray, patch: int) -> np.ndarray:
+    """Score each `patch` x `patch` tile by its mean Sobel gradient magnitude.
+    Returns a float32 array with one row per tile row and one column per
+    tile column. Pixels beyond the last full tile are ignored.
+    """
+    luma = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY).astype(np.float32)
+    dx = cv2.Sobel(luma, cv2.CV_32F, 1, 0, ksize=3)
+    dy = cv2.Sobel(luma, cv2.CV_32F, 0, 1, ksize=3)
+    magnitude = np.sqrt(dx * dx + dy * dy)
+    full_h, full_w = magnitude.shape
+    rows = full_h // patch
+    cols = full_w // patch
+    # Split the cropped image into (rows, patch, cols, patch) tiles and
+    # average over the two within-tile axes.
+    tiles = magnitude[: rows * patch, : cols * patch].reshape(rows, patch, cols, patch)
+    return tiles.mean(axis=(1, 3)).astype(np.float32)
+def patch_score_frame_diff(
+    prev_bgr: np.ndarray, cur_bgr: np.ndarray, patch: int,
+) -> np.ndarray:
+    """Score each tile by its mean absolute difference to the previous frame.
+    When there is no previous frame (`prev_bgr` is None) or its shape differs
+    from `cur_bgr`, the Sobel score from `patch_score_grid` is returned for
+    the current frame instead.
+    """
+    comparable = prev_bgr is not None and prev_bgr.shape == cur_bgr.shape
+    if not comparable:
+        return patch_score_grid(cur_bgr, patch)
+    # Average the per-channel absolute difference into one value per pixel.
+    delta = cv2.absdiff(prev_bgr, cur_bgr).mean(axis=2).astype(np.float32)
+    full_h, full_w = delta.shape
+    rows = full_h // patch
+    cols = full_w // patch
+    tiles = delta[: rows * patch, : cols * patch].reshape(rows, patch, cols, patch)
+    return tiles.mean(axis=(1, 3))
+def compute_score_grids(
+    frames: List[np.ndarray], patch: int, signal: str,
+) -> List[np.ndarray]:
+    """Build one patch score grid per frame from the chosen saliency signal.
+    Signals:
+    - 'gradient'   - Sobel magnitude only (intra-frame complexity)
+    - 'frame_diff' - absdiff vs previous frame (temporal motion)
+    - 'combined'   - 0.5 * gradient_norm + 0.5 * frame_diff_norm
+    For 'combined', each component is independently shifted and scaled to
+    [0, 1] across the whole sample so both contribute on equal footing.
+    A missing signal means 'gradient'; any other unrecognized value is
+    treated as 'combined'.
+    Returns:
+        A list with one grid per input frame; empty when `frames` is empty.
+    """
+    sig = (signal or "gradient").lower()
+    # np.stack below cannot handle an empty list, so every mode returns []
+    # for an empty sample.
+    if not frames:
+        return []
+    def _gradient_grids() -> List[np.ndarray]:
+        return [patch_score_grid(f, patch) for f in frames]
+    def _diff_grids() -> List[np.ndarray]:
+        # Pair each frame with its predecessor; the first frame has none.
+        previous = [None, *frames[:-1]]
+        return [
+            patch_score_frame_diff(p, f, patch)
+            for p, f in zip(previous, frames)
+        ]
+    def _norm01(a: np.ndarray) -> np.ndarray:
+        a = a.astype(np.float32) - a.min()
+        m = a.max()
+        return a / m if m > 1e-8 else a
+    if sig == "gradient":
+        return _gradient_grids()
+    if sig == "frame_diff":
+        return _diff_grids()
+    # combined
+    g = np.stack(_gradient_grids(), axis=0)
+    d = np.stack(_diff_grids(), axis=0)
+    combined = 0.5 * _norm01(g) + 0.5 * _norm01(d)
+    return list(combined)
+def topk_mask(score: np.ndarray, k: int) -> np.ndarray:
+    """Return a uint8 mask, shaped like `score`, marking the `k` highest scores.
+    Exactly `k` entries are set whenever 0 < k < score.size. Ties are broken
+    in raster (row-major) order, so a frame with many equal scores still
+    yields `k` patches rather than every patch at or above the cut-off.
+    Args:
+        score: Patch score grid.
+        k: Number of patches to keep.
+    Returns:
+        All ones when `k >= score.size`, all zeros when `k <= 0`, otherwise a
+        mask with exactly `k` ones.
+    """
+    flat = score.ravel()
+    if k >= flat.size:
+        return np.ones_like(score, dtype=np.uint8)
+    if k <= 0:
+        return np.zeros_like(score, dtype=np.uint8)
+    # A stable descending sort keeps equal scores in raster order, which makes
+    # the choice among ties deterministic.
+    order = np.argsort(-flat.astype(np.float64), kind="stable")
+    mask = np.zeros(flat.size, dtype=np.uint8)
+    mask[order[:k]] = 1
+    return mask.reshape(score.shape)
+def faded_background(frame_bgr: np.ndarray, fade: float = 0.55) -> np.ndarray:
+    """Return a gray-white wash of the frame: gray * (1 - fade) + white * fade.
+    The frame is first reduced to grayscale and expanded back to three
+    channels; the result is uint8.
+    """
+    mono = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
+    mono_bgr = cv2.cvtColor(mono, cv2.COLOR_GRAY2BGR).astype(np.float32)
+    washed = mono_bgr * (1.0 - fade) + np.full_like(mono_bgr, 255.0) * fade
+    return washed.astype(np.uint8)
+def overlay_selection(
+    frame_bgr: np.ndarray, mask_grid: np.ndarray, patch: int,
+    outline: bool = True, fade: float = 0.55,
+) -> np.ndarray:
+    """Show kept patches in color over a gray-white wash of the dropped ones.
+    `mask_grid` holds one non-zero entry per kept patch. With `outline` set,
+    a one-pixel rectangle is drawn around every kept patch.
+    """
+    height, width = frame_bgr.shape[:2]
+    # Expand the per-patch mask to a per-pixel mask, cropped to the frame.
+    tile = np.ones((patch, patch), dtype=np.uint8)
+    keep_px = np.kron(mask_grid, tile)[:height, :width].astype(bool)
+    washed = faded_background(frame_bgr, fade=float(fade))
+    out = np.where(keep_px[..., None], frame_bgr, washed)
+    if not outline:
+        return out
+    for row, col in np.argwhere(mask_grid):
+        top, left = int(row) * patch, int(col) * patch
+        cv2.rectangle(
+            out, (left, top), (left + patch - 1, top + patch - 1),
+            (0, 220, 255), 1,
+        )
+    return out
+def _normalize_scores(grids: List[np.ndarray], pct: float = 99.0) -> np.ndarray:
+    """Stack the grids and rescale them to [0, 1] against a global percentile.
+    The stacked scores are shifted so the overall minimum is 0, divided by
+    their `pct`-th percentile, and clipped to [0, 1]. Dividing by a
+    percentile rather than the maximum keeps a few outlier patches from
+    compressing everything else. If the percentile is effectively zero the
+    maximum is used instead, or 1.0 when that is zero too.
+    """
+    stacked = np.stack(grids, axis=0).astype(np.float32)
+    stacked -= stacked.min()
+    ceiling = np.percentile(stacked, pct) if stacked.size else 1.0
+    if ceiling <= 1e-8:
+        ceiling = float(stacked.max() or 1.0)
+    return np.clip(stacked / ceiling, 0.0, 1.0)
+def overlay_heatmap(
+    frame_bgr: np.ndarray, score_grid: np.ndarray, patch: int,
+    alpha: float = 0.55,
+) -> np.ndarray:
+    """Blend a JET-colored map of the patch scores over the frame.
+    `score_grid` is clipped to [0, 1] before being mapped to 0-255. The heat
+    layer is weighted by `alpha` and the frame by `1 - alpha`.
+    """
+    height, width = frame_bgr.shape[:2]
+    levels = (np.clip(score_grid, 0.0, 1.0) * 255.0).astype(np.uint8)
+    # Repeat every patch score over its patch x patch pixels, cropped to the frame.
+    per_pixel = np.kron(levels, np.ones((patch, patch), dtype=np.uint8))
+    per_pixel = per_pixel[:height, :width]
+    colored = cv2.applyColorMap(per_pixel, cv2.COLORMAP_JET)
+    return cv2.addWeighted(frame_bgr, 1.0 - alpha, colored, alpha, 0.0)
+def overlay_sbs(
+    frame_bgr: np.ndarray, mask_grid: np.ndarray, score_grid: np.ndarray,
+    patch: int, alpha: float = 0.55, fade: float = 0.55,
+) -> np.ndarray:
+    """Return [selection | heatmap] side by side with a 4-pixel dark divider.
+    Each half is labelled in its top-left corner.
+    """
+    selection_panel = overlay_selection(
+        frame_bgr, mask_grid, patch, outline=True, fade=fade,
+    )
+    heatmap_panel = overlay_heatmap(frame_bgr, score_grid, patch, alpha=alpha)
+    panel_h, panel_w = selection_panel.shape[:2]
+    divider = np.full((panel_h, 4, 3), 30, dtype=np.uint8)
+    combined = np.concatenate([selection_panel, divider, heatmap_panel], axis=1)
+    # The right-hand label starts just past the left panel and the divider.
+    for text, x_pos in (("selection", 8), ("heatmap", panel_w + 12)):
+        cv2.putText(combined, text, (x_pos, 22), cv2.FONT_HERSHEY_SIMPLEX,
+                    0.6, (255, 255, 255), 2, cv2.LINE_AA)
+    return combined
+def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
+    """Encode frames to an H.264 mp4 with imageio-ffmpeg's ffmpeg binary.
+    Raw bytes are piped to ffmpeg's stdin as bgr24 and encoded with libx264 /
+    yuv420p. Every frame is cropped to the even size derived from the first
+    frame, and that same cropped size is what ffmpeg is told to expect, so
+    the declared geometry always matches the bytes sent.
+    Args:
+        frames: Frames to encode; expected to be uint8 BGR to match
+            `-pix_fmt bgr24`, and presumably all the same size (verify
+            against the caller).
+        path: Output file path; overwritten if it exists (`-y`).
+        fps: Frame rate declared for the raw input stream.
+    Raises:
+        ValueError: if `frames` is empty or the first frame is under 2x2.
+        RuntimeError: if ffmpeg exits with a non-zero status.
+    """
+    if not frames:
+        raise ValueError("no frames to write")
+    h, w = frames[0].shape[:2]
+    # Round both sides down to even values before building the command.
+    h -= h % 2
+    w -= w % 2
+    if h < 2 or w < 2:
+        raise ValueError("frames must be at least 2x2 pixels")
+    ff = imageio_ffmpeg.get_ffmpeg_exe()
+    cmd = [
+        ff, "-y", "-loglevel", "error",
+        "-f", "rawvideo", "-vcodec", "rawvideo",
+        "-s", f"{w}x{h}", "-pix_fmt", "bgr24",
+        "-r", f"{fps:.3f}", "-i", "-",
+        "-an", "-vcodec", "libx264", "-pix_fmt", "yuv420p",
+        "-preset", "veryfast", "-crf", "23",
+        "-movflags", "+faststart",
+        path,
+    ]
+    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+    try:
+        try:
+            for f in frames:
+                proc.stdin.write(np.ascontiguousarray(f[:h, :w]).tobytes())
+            proc.stdin.close()
+        except BrokenPipeError:
+            # ffmpeg stopped reading before all input was sent. Fall through
+            # so its exit status and stderr are reported below instead of a
+            # bare pipe error.
+            try:
+                proc.stdin.close()
+            except OSError:
+                pass
+        err = proc.stderr.read().decode("utf-8", errors="ignore")
+        rc = proc.wait()
+        if rc != 0:
+            raise RuntimeError(f"ffmpeg failed (rc={rc}): {err}")
+    finally:
+        # Never leave an encoder process behind if something above raised.
+        if proc.poll() is None:
+            proc.kill()
+def pack_canvas(
+    frames: List[np.ndarray], masks: List[np.ndarray], patch: int,
+) -> Tuple[np.ndarray, int]:
+    """Tile every selected patch onto one near-square, white-padded canvas.
+    Patches are gathered frame by frame and, within a frame, in raster
+    order, then laid out row by row on a grid whose side is
+    ceil(sqrt(number of patches)).
+    Returns:
+        (canvas, count). With nothing selected, the canvas is a single white
+        patch and the count is 0.
+    """
+    tiles: List[np.ndarray] = [
+        frame[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch].copy()
+        for frame, mask in zip(frames, masks)
+        for r, c in np.argwhere(mask)
+    ]
+    count = len(tiles)
+    if count == 0:
+        return np.full((patch, patch, 3), 255, dtype=np.uint8), 0
+    side = int(math.ceil(math.sqrt(count)))
+    canvas = np.full((side * patch, side * patch, 3), 255, dtype=np.uint8)
+    for slot, tile in enumerate(tiles):
+        row, col = divmod(slot, side)
+        canvas[row * patch:(row + 1) * patch, col * patch:(col + 1) * patch] = tile
+    return canvas, count
+def process(
+    video_path,
+    sample_frames: int,
+    patch_size: int,
+    top_k_per_frame: int,
+    max_pixels: int,
+    viz_mode: str = "selection",
+    heatmap_alpha: float = 0.55,
+    start_sec: float = 0.0,
+    end_sec: float = 0.0,
+    saliency_signal: str = "gradient",
+    score_log_scale: bool = False,
+    bitcost_pct: float = 99.0,
+    fade_strength: float = 0.55,
+    progress=gr.Progress(track_tqdm=False),
+):
+    """Run the full pipeline for one video and return its three outputs.
+    Steps: read metadata, pick frame indices (optionally inside the
+    start/end time window), decode and resize, score patches, keep the top-K
+    per frame, render the visualization video, pack the canvas image.
+    Returns:
+        (vis_video_path, canvas_png_path, info_json) on success. On failure
+        the two paths are None and the third item is a plain message or a
+        JSON document with an "error" key.
+    """
+    if not video_path:
+        return None, None, "Please upload a video."
+    t0 = time.time()
+    progress(0.05, desc="Reading metadata")
+    meta = video_metadata(video_path)
+    total = meta.get("total_frames") or 0
+    if total <= 0:
+        return None, None, json.dumps(
+            {"error": "Could not read frame count.", "metadata": meta},
+            indent=2, ensure_ascii=False,
+        )
+    progress(0.10, desc="Sampling frames")
+    fps = float(meta.get("fps") or 0.0)
+    s_sec = max(0.0, float(start_sec or 0.0))
+    e_sec = float(end_sec or 0.0)
+    if fps > 0 and (s_sec > 0 or e_sec > 0):
+        f_start = max(0, int(round(s_sec * fps)))
+        if f_start >= total:
+            # Nothing can be sampled from a window that starts after the last
+            # frame; say so instead of reporting a decode failure.
+            return None, None, json.dumps(
+                {
+                    "error": "Start time is past the end of the video.",
+                    "metadata": meta,
+                },
+                indent=2, ensure_ascii=False,
+            )
+        f_end = (
+            min(total - 1, int(round(e_sec * fps)) - 1)
+            if e_sec > 0 else total - 1
+        )
+        # An end at or before the start means "until the end of the video".
+        if f_end <= f_start:
+            f_end = total - 1
+        window_total = f_end - f_start + 1
+        if int(sample_frames) >= window_total:
+            fids = list(range(f_start, f_end + 1))
+        else:
+            fids = [
+                int(round(x))
+                for x in np.linspace(f_start, f_end, int(sample_frames))
+            ]
+    else:
+        f_start, f_end = 0, total - 1
+        fids = sample_frame_ids(total, int(sample_frames))
+    raw = decode_frames(video_path, fids)
+    if not raw:
+        return None, None, json.dumps(
+            {"error": "Failed to decode frames.", "metadata": meta},
+            indent=2, ensure_ascii=False,
+        )
+    progress(0.25, desc="smart_resize")
+    resized = [smart_resize(f, int(max_pixels), int(patch_size)) for f in raw]
+    # Force every frame to the size of the first one so grids can be stacked.
+    th, tw = resized[0].shape[:2]
+    resized = [
+        cv2.resize(f, (tw, th), interpolation=cv2.INTER_AREA)
+        if f.shape[:2] != (th, tw) else f
+        for f in resized
+    ]
+    progress(0.40, desc=f"Scoring patches ({saliency_signal})")
+    grids = compute_score_grids(resized, int(patch_size), saliency_signal)
+    if score_log_scale:
+        grids = [np.log1p(np.clip(g, 0.0, None)) for g in grids]
+    masks = [topk_mask(g, int(top_k_per_frame)) for g in grids]
+    norm_scores = _normalize_scores(grids, pct=float(bitcost_pct))
+    mode = (viz_mode or "selection").lower()
+    if mode not in ("selection", "heatmap", "sbs"):
+        mode = "selection"
+    progress(0.60, desc=f"Rendering {mode} video")
+    if mode == "heatmap":
+        vis = [
+            overlay_heatmap(f, s, int(patch_size), alpha=float(heatmap_alpha))
+            for f, s in zip(resized, norm_scores)
+        ]
+    elif mode == "sbs":
+        vis = [
+            overlay_sbs(
+                f, m, s, int(patch_size),
+                alpha=float(heatmap_alpha), fade=float(fade_strength),
+            )
+            for f, m, s in zip(resized, masks, norm_scores)
+        ]
+    else:
+        vis = [
+            overlay_selection(f, m, int(patch_size), fade=float(fade_strength))
+            for f, m in zip(resized, masks)
+        ]
+    out_dir = tempfile.mkdtemp(prefix="codec_view_")
+    vis_path = os.path.join(out_dir, f"{mode}_vis.mp4")
+    # Play the sampled frames at a quarter of the source rate, within 2-8 fps.
+    vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
+    write_mp4(vis, vis_path, vis_fps)
+    progress(0.85, desc="Packing canvas")
+    canvas, n_selected = pack_canvas(resized, masks, int(patch_size))
+    canvas_path = os.path.join(out_dir, "canvas.png")
+    cv2.imwrite(canvas_path, canvas)
+    hb, wb = grids[0].shape
+    info = {
+        "input": meta,
+        "params": {
+            "sample_frames": int(sample_frames),
+            "patch_size": int(patch_size),
+            "top_k_per_frame": int(top_k_per_frame),
+            "max_pixels": int(max_pixels),
+            "start_sec": float(s_sec),
+            "end_sec": float(e_sec) if e_sec > 0 else None,
+            "saliency_signal": saliency_signal,
+            "score_log_scale": bool(score_log_scale),
+            "bitcost_pct": float(bitcost_pct),
+            "fade_strength": float(fade_strength),
+        },
+        "frame_window": {
+            "first_decoded": int(f_start),
+            "last_decoded": int(f_end),
+            "actual_frame_ids": [int(x) for x in fids],
+            # Frames that fail to decode are skipped, so this can be lower
+            # than the number of requested ids above.
+            "decoded_frames": len(raw),
+        },
+        "resized_frame_size": f"{tw}x{th}",
+        "patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
+        "selected_per_frame": int(min(top_k_per_frame, hb * wb)),
+        "total_selected_patches": int(n_selected),
+        "canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
+        "vis_video_fps": round(vis_fps, 2),
+        "viz_mode": mode,
+        "heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
+        "score_normalization": f"shift-min, /p{bitcost_pct:.1f}, clip"
+        + (" (log1p applied)" if score_log_scale else ""),
+        "elapsed_sec": round(time.time() - t0, 2),
+    }
+    progress(1.0, desc="Done")
+    return vis_path, canvas_path, json.dumps(info, indent=2, ensure_ascii=False)
+# Stylesheet passed to gr.Blocks(css=...). Selectors target the element ids
+# and classes used by the layout and hero markup: #ovc-hero, .ovc-card,
+# #ovc-run and #ovc-footer.
+CUSTOM_CSS = """
+:root, .gradio-container, .gradio-container.dark {
+    --ovc-grad: linear-gradient(135deg, #4f46e5 0%, #2563eb 50%, #06b6d4 100%);
+}
+.gradio-container { max-width: 1280px !important; margin: 0 auto !important; }
+#ovc-hero {
+    text-align: center;
+    padding: 28px 16px 8px;
+    border-radius: 16px;
+    background: linear-gradient(180deg, rgba(79,70,229,0.08), rgba(6,182,212,0.04));
+    margin-bottom: 8px;
+}
+#ovc-hero h1 {
+    font-size: 2.1rem;
+    font-weight: 700;
+    background: var(--ovc-grad);
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent;
+    margin: 0 0 6px;
+    letter-spacing: -0.02em;
+}
+#ovc-hero p.tagline {
+    font-size: 1.02rem;
+    color: var(--body-text-color-subdued);
+    margin: 0 auto 12px;
+    max-width: 720px;
+    line-height: 1.55;
+}
+#ovc-hero .pills { display:flex; flex-wrap:wrap; gap:6px; justify-content:center; margin-top:6px; }
+#ovc-hero .pill {
+    font-size: 0.78rem;
+    font-weight: 600;
+    padding: 4px 10px;
+    border-radius: 999px;
+    color: #fff;
+    background: var(--ovc-grad);
+    opacity: 0.92;
+}
+.ovc-card {
+    border-radius: 14px !important;
+    padding: 14px 16px !important;
+    border: 1px solid var(--border-color-primary) !important;
+    background: var(--background-fill-primary) !important;
+    box-shadow: 0 1px 2px rgba(0,0,0,0.04);
+}
+.ovc-card h3 {
+    font-size: 0.86rem !important;
+    font-weight: 700 !important;
+    text-transform: uppercase;
+    letter-spacing: 0.06em;
+    color: var(--body-text-color-subdued) !important;
+    margin: 0 0 8px !important;
+}
+#ovc-run button {
+    width: 100%;
+    height: 48px !important;
+    font-size: 1.02rem !important;
+    font-weight: 600 !important;
+    background: var(--ovc-grad) !important;
+    border: none !important;
+    color: #fff !important;
+    border-radius: 12px !important;
+    box-shadow: 0 4px 14px rgba(37, 99, 235, 0.35);
+    transition: transform 0.05s ease;
+}
+#ovc-run button:hover { transform: translateY(-1px); }
+#ovc-run button:active { transform: translateY(0); }
+#ovc-footer {
+    text-align: center;
+    color: var(--body-text-color-subdued);
+    font-size: 0.78rem;
+    padding: 18px 8px 8px;
+    margin-top: 10px;
+}
+"""
+# Gradio "Soft" theme with indigo / blue / slate hues and the Inter font
+# (falling back to system-ui, then sans-serif). The .set(...) call overrides
+# the page background, block corner radius and primary-button fills.
+THEME = gr.themes.Soft(
+    primary_hue="indigo",
+    secondary_hue="blue",
+    neutral_hue="slate",
+    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
+).set(
+    body_background_fill="*neutral_50",
+    block_radius="14px",
+    button_primary_background_fill="*primary_500",
+    button_primary_background_fill_hover="*primary_600",
+)
+# Title banner rendered with gr.HTML at the top of the page; its ids and
+# classes (#ovc-hero, .tagline, .pills, .pill) are styled by CUSTOM_CSS.
+HERO_HTML = """
+<div id="ovc-hero">
+  <h1>OneVision Encoder Codec View</h1>
+  <p class="tagline">
+    Visualize which patches a codec-style saliency picks from your video,
+    then pack them into the canvas LLaVA-OneVision2 consumes.
+    Use it to inspect <i>where</i> the model is actually looking.
+  </p>
+  <div class="pills">
+    <span class="pill">selection · heatmap · sbs</span>
+    <span class="pill">gradient + motion signals</span>
+    <span class="pill">canvas export</span>
+  </div>
+</div>
+"""
+# Page layout: hero banner, a narrow column of controls next to a wide column
+# of outputs, a footer note, and the Run button wired to process().
+with gr.Blocks(title="OneVision Encoder Codec View", theme=THEME, css=CUSTOM_CSS) as demo:
+    gr.HTML(HERO_HTML)
+    with gr.Row(equal_height=False):
+        # --- Controls (narrow column) ---
+        with gr.Column(scale=4, min_width=320):
+            with gr.Group(elem_classes="ovc-card"):
+                gr.Markdown("### Input")
+                video_in = gr.Video(label="Video", sources=["upload"], height=240)
+            with gr.Group(elem_classes="ovc-card"):
+                gr.Markdown("### Pipeline")
+                viz_mode = gr.Radio(
+                    ["selection", "heatmap", "sbs"], value="selection",
+                    label="Visualization mode",
+                )
+                sample_frames = gr.Slider(
+                    4, 64, value=16, step=1, label="Sampled frames",
+                )
+                top_k = gr.Slider(
+                    4, 1024, value=64, step=4, label="Top-K patches per frame",
+                )
+                patch_size = gr.Radio(
+                    PATCH_CHOICES, value=14, label="Patch size (px)",
+                )
+            # Optional settings live in accordions that start collapsed.
+            with gr.Accordion("Time window", open=False):
+                with gr.Row():
+                    start_sec = gr.Number(value=0.0, precision=2, label="Start (s)")
+                    end_sec = gr.Number(value=0.0, precision=2, label="End (s)")
+                gr.Markdown(
+                    "<small>Set both to 0 to use the full video.</small>",
+                )
+            with gr.Accordion("Saliency", open=False):
+                saliency_signal = gr.Radio(
+                    ["gradient", "frame_diff", "combined"], value="gradient",
+                    label="Scoring signal",
+                    info="gradient = intra-frame Sobel · "
+                         "frame_diff = inter-frame motion · "
+                         "combined = 0.5 each.",
+                )
+                score_log_scale = gr.Checkbox(
+                    value=False, label="Apply log1p to scores",
+                )
+                bitcost_pct = gr.Slider(
+                    80.0, 99.9, value=99.0, step=0.1,
+                    label="Heatmap normalization percentile",
+                )
+            with gr.Accordion("Visual style", open=False):
+                heatmap_alpha = gr.Slider(
+                    0.1, 0.9, value=0.55, step=0.05,
+                    label="Heatmap blend α",
+                )
+                fade_strength = gr.Slider(
+                    0.0, 0.9, value=0.55, step=0.05,
+                    label="Selection fade strength",
+                )
+                max_pixels = gr.Slider(
+                    40000, 400000, value=150000, step=10000,
+                    label="Max pixels per frame",
+                )
+            # elem_id="ovc-run" is the hook for the Run-button rules in CUSTOM_CSS.
+            with gr.Row(elem_id="ovc-run"):
+                run_btn = gr.Button("Run pipeline", variant="primary")
+        # --- Outputs (wide column) ---
+        with gr.Column(scale=6, min_width=420):
+            with gr.Group(elem_classes="ovc-card"):
+                gr.Markdown("### Patch selection visualization")
+                vis_out = gr.Video(
+                    label="", show_label=False, autoplay=True, height=420,
+                )
+            with gr.Row():
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="ovc-card"):
+                        gr.Markdown("### Packed canvas")
+                        canvas_out = gr.Image(
+                            label="", show_label=False, show_download_button=True,
+                            height=320,
+                        )
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="ovc-card"):
+                        gr.Markdown("### Run info")
+                        info_out = gr.Code(
+                            label="", language="json", lines=14,
+                        )
+    gr.HTML(
+        '<div id="ovc-footer">'
+        'Approximation of the bitcost-driven patch selection in '
+        '<code>codec_tools/</code> · Sobel + frame-diff used as a stand-in '
+        'for the ffmpeg bitcost patch.'
+        '</div>'
+    )
+    # The inputs are listed in the same order as process()'s parameters, and
+    # the outputs in the order of its return tuple (video, canvas, run info).
+    run_btn.click(
+        process,
+        inputs=[
+            video_in, sample_frames, patch_size, top_k, max_pixels,
+            viz_mode, heatmap_alpha,
+            start_sec, end_sec,
+            saliency_signal, score_log_scale, bitcost_pct, fade_strength,
+        ],
+        outputs=[vis_out, canvas_out, info_out],
     )
 if __name__ == "__main__":
+    # Listen on all interfaces; the port comes from $PORT and defaults to 7860.
+    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+opencv-python-headless>=4.8
+numpy>=1.24
+imageio>=2.34
+imageio-ffmpeg>=0.5
+Pillow>=10.0