Spaces:

JackIsNotInTheBox
/

watermark_remover

Paused

App Files Files Community

Jack Wu committed 11 days ago

Commit

ebe8a5c

1 Parent(s): 4f8616d

feat: introduce checkpoint mirroring script, strengthen video validation, and improve pipeline robustness for masking and compositing.

Browse files

Files changed (7) hide show

app.py +45 -3
pipeline/composite.py +9 -1
pipeline/crop.py +18 -5
pipeline/lama.py +8 -2
pipeline/video.py +41 -6
requirements.txt +1 -1
scripts/mirror_checkpoints.py +138 -0

app.py CHANGED Viewed

@@ -265,7 +265,25 @@ def _meta_to_dict(meta: VideoMeta) -> dict:
 def _dict_to_meta(d: dict) -> VideoMeta:
-    return VideoMeta(**d)
 # ---------------------------------------------------------------------------
@@ -281,13 +299,23 @@ def on_video_upload(video_path: str | None):
         meta = probe(video_path)
         # ── Input validation — guard against disk exhaustion on ZeroGPU ──
-        MAX_DURATION_S = 16.0
         MAX_PIXELS = 1920 * 1080
         if meta.duration_s > MAX_DURATION_S:
             return (
                 gr.update(), gr.update(), None,
                 f"❌ Clip too long ({meta.duration_s:.1f}s). Max {MAX_DURATION_S:.0f} seconds.",
             )
         if meta.width * meta.height > MAX_PIXELS:
             return (
                 gr.update(), gr.update(), None,
@@ -481,7 +509,11 @@ def run_pipeline(
     )
     with VideoWorkspace() as ws:
-        safe_video = ws.path("source" + Path(video_path).suffix)
         shutil.copy2(video_path, safe_video)
         # ── Extract frames (CFR-forced for VFR safety) ─────────────────
@@ -490,6 +522,16 @@ def run_pipeline(
         total = len(frame_paths)
         # ── GPU: inpaint + composite + save ────────────────────────────
         progress(0.15, desc="Starting inpainting…")
         _inpaint_composite_save_gpu(
             frame_paths, crop_region, inpaint_mask,

 def _dict_to_meta(d: dict) -> VideoMeta:
+    """Reconstruct a VideoMeta from a dict, ignoring unknown keys.
+
+    The dict lives in Gradio ``State`` and may contain extra fields if a
+    user has a cached session from an older or newer version of the app.
+    Passing ``**d`` directly would raise ``TypeError`` on unexpected keys.
+
+    ``width``, ``height``, ``fps``, ``frame_count`` and ``duration_s`` are
+    read with ``d[...]`` and therefore raise ``KeyError`` when absent. The
+    colour fields default to ``None``, ``codec_name`` to ``"unknown"`` and
+    ``bit_depth`` to ``8`` when the key is missing.
+    """
+    return VideoMeta(
+        width=d["width"],
+        height=d["height"],
+        fps=d["fps"],
+        frame_count=d["frame_count"],
+        duration_s=d["duration_s"],
+        color_primaries=d.get("color_primaries"),
+        color_trc=d.get("color_trc"),
+        colorspace=d.get("colorspace"),
+        color_range=d.get("color_range"),
+        codec_name=d.get("codec_name", "unknown"),
+        bit_depth=d.get("bit_depth", 8),
+    )
 # ---------------------------------------------------------------------------
         meta = probe(video_path)
         # ── Input validation — guard against disk exhaustion on ZeroGPU ──
+        MAX_DURATION_S = 15.0
         MAX_PIXELS = 1920 * 1080
+        # Max frames caps videos where ffprobe returns N/A for duration
+        # (VFR/container-less formats). duration_s would be 0.0 after our guard,
+        # so the duration check alone would pass an arbitrarily long clip.
+        MAX_FRAMES = round(MAX_DURATION_S * max(meta.fps, 1.0))
         if meta.duration_s > MAX_DURATION_S:
             return (
                 gr.update(), gr.update(), None,
                 f"❌ Clip too long ({meta.duration_s:.1f}s). Max {MAX_DURATION_S:.0f} seconds.",
             )
+        if meta.frame_count > MAX_FRAMES:
+            return (
+                gr.update(), gr.update(), None,
+                f"❌ Clip too long ({meta.frame_count} frames at {meta.fps:.2f} fps). "
+                f"Max {MAX_DURATION_S:.0f} seconds.",
+            )
         if meta.width * meta.height > MAX_PIXELS:
             return (
                 gr.update(), gr.update(), None,
     )
     with VideoWorkspace() as ws:
+        # Preserve the original file extension so FFmpeg can detect the container
+        # format. Gradio always adds an extension for video uploads, but fall back
+        # to .mp4 if the path somehow has none.
+        src_suffix = Path(video_path).suffix or ".mp4"
+        safe_video = ws.path("source" + src_suffix)
         shutil.copy2(video_path, safe_video)
         # ── Extract frames (CFR-forced for VFR safety) ─────────────────
         total = len(frame_paths)
         # ── GPU: inpaint + composite + save ────────────────────────────
+        # Validate mode on CPU before acquiring GPU so unimplemented modes
+        # fail fast without burning ZeroGPU quota.
+        _VALID_MODES = ("Fast (LaMa)", "Quality (VACE-14B)")
+        if mode not in _VALID_MODES:
+            raise gr.Error(f"Unknown mode '{mode}'. Choose from: {_VALID_MODES}")
+        if mode == "Quality (VACE-14B)":
+            raise gr.Error(
+                "VACE-14B quality mode is not yet available. "
+                "Please select Fast (LaMa)."
+            )
         progress(0.15, desc="Starting inpainting…")
         _inpaint_composite_save_gpu(
             frame_paths, crop_region, inpaint_mask,

pipeline/composite.py CHANGED Viewed

@@ -184,11 +184,19 @@ def composite_frames(
         Composited full-frame images (uint8 RGB), one per input frame.
     """
     alpha = feathered_alpha(inpaint_mask, feather_radius)
     return [
         composite_with_alpha(
             np.array(Image.open(fp).convert("RGB")), crop, crop_region, alpha
         )
-        for fp, crop in zip(original_frame_paths, inpainted_crops)
     ]

         Composited full-frame images (uint8 RGB), one per input frame.
     """
     alpha = feathered_alpha(inpaint_mask, feather_radius)
+    # Materialise to a list: a generator argument has no len() and can only
+    # be iterated once, but we need both the length check and the zip below.
+    frame_paths = list(original_frame_paths)
+    if len(frame_paths) != len(inpainted_crops):
+        raise ValueError(
+            f"composite_frames: {len(frame_paths)} frame paths but "
+            f"{len(inpainted_crops)} crops — lengths must match."
+        )
     return [
         composite_with_alpha(
             np.array(Image.open(fp).convert("RGB")), crop, crop_region, alpha
         )
+        for fp, crop in zip(frame_paths, inpainted_crops)
     ]

pipeline/crop.py CHANGED Viewed

@@ -169,6 +169,7 @@ def mask_to_bbox(mask: np.ndarray) -> BBox:
     ----------
     mask : np.ndarray
         Single-channel mask, dtype uint8. Non-zero pixels = drawn area.
     Returns
     -------
@@ -178,8 +179,13 @@ def mask_to_bbox(mask: np.ndarray) -> BBox:
     Raises
     ------
     ValueError
-        If the mask contains no drawn pixels.
     """
     if mask.ndim == 3:
         # Accept RGB/RGBA — collapse to single channel
         mask = mask.max(axis=2)
@@ -318,9 +324,10 @@ def compute_crop_region(
     target_w = min(target_w, frame_w)
     target_h = min(target_h, frame_h)
-    # Round down to multiple of 32 after clamping
-    target_w = _floor_to_multiple(target_w, 32)
-    target_h = _floor_to_multiple(target_h, 32)
     # ------------------------------------------------------------------
     # 3. Centre on watermark centroid, then clamp to frame bounds
@@ -394,7 +401,13 @@ def build_inpaint_mask(
         y2 = crop_region.frame_y + crop_region.frame_h
         x1 = crop_region.frame_x
         x2 = crop_region.frame_x + crop_region.frame_w
-        mask = source_mask[y1:y2, x1:x2].copy()
     else:
         # Fill the watermark bbox rectangle
         b = crop_region.mask_bbox

     ----------
     mask : np.ndarray
         Single-channel mask, dtype uint8. Non-zero pixels = drawn area.
+        2D (H×W) or 3D (H×W×C) arrays are accepted; 4D+ is rejected.
     Returns
     -------
     Raises
     ------
     ValueError
+        If the mask contains no drawn pixels, or is 4-dimensional or higher.
     """
+    if mask.ndim > 3:
+        raise ValueError(
+            f"mask_to_bbox: expected a 2D or 3D mask array, got ndim={mask.ndim}. "
+            "Pass a single-frame H×W or H×W×C numpy array."
+        )
     if mask.ndim == 3:
         # Accept RGB/RGBA — collapse to single channel
         mask = mask.max(axis=2)
     target_w = min(target_w, frame_w)
     target_h = min(target_h, frame_h)
+    # Round down to multiple of 32 after clamping.
+    # max(..., 32) ensures we never produce a 0-dim crop for very small frames.
+    target_w = max(_floor_to_multiple(target_w, 32), 32)
+    target_h = max(_floor_to_multiple(target_h, 32), 32)
     # ------------------------------------------------------------------
     # 3. Centre on watermark centroid, then clamp to frame bounds
         y2 = crop_region.frame_y + crop_region.frame_h
         x1 = crop_region.frame_x
         x2 = crop_region.frame_x + crop_region.frame_w
+        cropped = source_mask[y1:y2, x1:x2]
+        # Binarise: any non-zero value (including raw Gradio layer values
+        # like 200 that are not exactly 255) becomes 255.
+        binarised = (cropped > 0).astype(np.uint8) * 255
+        # Guard: source_mask smaller than crop region causes cropped to be
+        # smaller than (frame_h, frame_w). Copy into a zero-padded full mask.
+        mask[: binarised.shape[0], : binarised.shape[1]] = binarised
     else:
         # Fill the watermark bbox rectangle
         b = crop_region.mask_bbox

pipeline/lama.py CHANGED Viewed

@@ -149,5 +149,11 @@ def _load_crop(frame_path: Path, crop_region: CropRegion) -> np.ndarray:
 def _mask_to_pil(mask: np.ndarray) -> Image.Image:
-    """Convert a uint8 numpy mask to a PIL L-mode image for LaMa."""
-    return Image.fromarray(mask, mode="L")

 def _mask_to_pil(mask: np.ndarray) -> Image.Image:
+    """Convert a 2D uint8 numpy mask to a PIL L-mode image for LaMa.
+
+    ``Image.fromarray`` on a 2D uint8 array automatically produces mode ``'L'``
+    (the format LaMa / simple-lama-inpainting expects for the mask input).
+    The explicit ``mode="L"`` argument is omitted to avoid the Pillow 13
+    deprecation warning for the type-coercion overload of that parameter.
+
+    NOTE(review): shape and dtype are not checked here; the resulting mode is
+    whatever ``Image.fromarray`` infers, so this presumably relies on callers
+    always passing a 2D uint8 array — confirm against call sites.
+    """
+    return Image.fromarray(mask)  # 2D uint8 → 'L' automatically

pipeline/video.py CHANGED Viewed

@@ -91,7 +91,12 @@ def probe(video_path: str | Path) -> VideoMeta:
     # Duration: prefer stream-level, fall back to format-level
     dur_str = video_stream.get("duration") or data.get("format", {}).get("duration", "0")
-    duration_s = float(dur_str)
     # Frame count
     nb_frames = video_stream.get("nb_frames")
@@ -114,9 +119,17 @@ def probe(video_path: str | Path) -> VideoMeta:
     except (ValueError, TypeError):
         bit_depth = _bd_from_pix_fmt()
     return VideoMeta(
-        width=int(video_stream["width"]),
-        height=int(video_stream["height"]),
         fps=fps,
         frame_count=frame_count,
         duration_s=duration_s,
@@ -181,7 +194,12 @@ def extract_frames(
     cmd.append(str(out_dir / pattern))
     _run(cmd)
-    frames = sorted(out_dir.glob("*.png"), key=lambda p: int(p.stem))
     if not frames:
         raise RuntimeError(f"No frames extracted from {video_path} into {out_dir}")
     return frames
@@ -282,14 +300,22 @@ def frames_to_video(
         vid_codec, pix_fmt = "libx264", "yuv420p"
         extra_codec_flags = []
     cmd = [
         "ffmpeg",
         "-y",
         "-framerate", _fps_str(meta.fps),
         "-i", str(Path(frames_dir) / pattern),
         "-c:v", vid_codec,
         "-preset", "slow",
         "-crf", str(crf),
         "-pix_fmt", pix_fmt,
         *extra_codec_flags,
         *color_flags,
@@ -377,7 +403,10 @@ class VideoWorkspace:
     def __exit__(self, *args) -> None:
         if self._tmpdir:
-            self._tmpdir.cleanup()
     def path(self, name: str) -> Path:
         """Return a path inside the workspace root."""
@@ -394,6 +423,8 @@ def _run(cmd: list[str]) -> subprocess.CompletedProcess:
         cmd,
         capture_output=True,
         text=True,
     )
     if result.returncode != 0:
         raise RuntimeError(
@@ -415,6 +446,10 @@ def _parse_rational(rat: str) -> float:
 def _fps_str(fps: float) -> str:
     """Convert fps float to a clean string for FFmpeg -framerate."""
     # Keep common exact fractions (24000/1001, 30000/1001, etc.)
     common = {
         23.976:  "24000/1001",
@@ -424,7 +459,7 @@ def _fps_str(fps: float) -> str:
         119.88:  "120000/1001",  # 120p (S1II high-frame-rate mode)
     }
     for approx, s in common.items():
-        if abs(fps - approx) < 0.01:
             return s
     return f"{fps:.6g}"

     # Duration: prefer stream-level, fall back to format-level
     dur_str = video_stream.get("duration") or data.get("format", {}).get("duration", "0")
+    try:
+        duration_s = float(dur_str)
+    except (ValueError, TypeError):
+        # ffprobe emits "N/A" for duration on VFR / container-less formats;
+        # fall back to 0.0 — frame_count will still be set from nb_frames.
+        duration_s = 0.0
     # Frame count
     nb_frames = video_stream.get("nb_frames")
     except (ValueError, TypeError):
         bit_depth = _bd_from_pix_fmt()
+    width = video_stream.get("width")
+    height = video_stream.get("height")
+    if not width or not height:
+        raise RuntimeError(
+            f"Video stream in {video_path} has no width/height — "
+            "the file may be corrupt or contain only audio."
+        )
     return VideoMeta(
+        width=int(width),
+        height=int(height),
         fps=fps,
         frame_count=frame_count,
         duration_s=duration_s,
     cmd.append(str(out_dir / pattern))
     _run(cmd)
+    # Filter to sequentially named PNGs only (FFmpeg writes purely numeric
+    # names, but a failed run might leave a non-numeric file behind).
+    frames = sorted(
+        (p for p in out_dir.glob("*.png") if p.stem.isdigit()),
+        key=lambda p: int(p.stem),
+    )
     if not frames:
         raise RuntimeError(f"No frames extracted from {video_path} into {out_dir}")
     return frames
         vid_codec, pix_fmt = "libx264", "yuv420p"
         extra_codec_flags = []
+    # Build even-dimension filter.
+    # yuv420p / yuv420p10le require both width and height to be divisible by 2.
+    # The source video can have odd dimensions (some encoders emit 1919×1079 etc.).
+    # scale=trunc(iw/2)*2:trunc(ih/2)*2 is the standard FFmpeg idiom for this.
+    even_filter = "scale=trunc(iw/2)*2:trunc(ih/2)*2"
     cmd = [
         "ffmpeg",
         "-y",
         "-framerate", _fps_str(meta.fps),
+        "-start_number", "1",           # explicit: frame files start at 000001.png
         "-i", str(Path(frames_dir) / pattern),
         "-c:v", vid_codec,
         "-preset", "slow",
         "-crf", str(crf),
+        "-vf", even_filter,
         "-pix_fmt", pix_fmt,
         *extra_codec_flags,
         *color_flags,
     def __exit__(self, *args) -> None:
+        """Call ``self._tmpdir.cleanup()`` if a temp dir is set, swallowing errors.
+
+        Returns ``None``, so an exception raised inside the ``with`` body is
+        not suppressed; only failures of the cleanup itself are ignored.
+        """
         if self._tmpdir:
+            try:
+                self._tmpdir.cleanup()
+            except Exception:  # PermissionError on Windows/macOS antivirus lock
+                pass  # Best-effort cleanup; the OS will reclaim the temp dir
     def path(self, name: str) -> Path:
         """Return a path inside the workspace root."""
         cmd,
         capture_output=True,
         text=True,
+        encoding="utf-8",
+        errors="replace",  # Non-UTF-8 chars in FFmpeg stderr replaced with �
     )
     if result.returncode != 0:
         raise RuntimeError(
 def _fps_str(fps: float) -> str:
     """Convert fps float to a clean string for FFmpeg -framerate."""
+    if fps <= 0:
+        # Should never happen with valid ffprobe output, but guard against
+        # corrupt metadata producing -framerate 0 which makes FFmpeg error.
+        return "30"
     # Keep common exact fractions (24000/1001, 30000/1001, etc.)
     common = {
         23.976:  "24000/1001",
         119.88:  "120000/1001",  # 120p (S1II high-frame-rate mode)
     }
     for approx, s in common.items():
+        if abs(fps - approx) < 0.015:
             return s
     return f"{fps:.6g}"

requirements.txt CHANGED Viewed

@@ -8,7 +8,7 @@
 gradio>=4.44.0,<5.0.0
 numpy>=1.24.0
 Pillow>=10.0.0
-scipy>=1.11.0        # mask dilation in pipeline/crop.py
 # ── Fast mode (LaMa) ──────────────────────────────────────────────────────
 simple-lama-inpainting>=0.1.2

 gradio>=4.44.0,<5.0.0
 numpy>=1.24.0
 Pillow>=10.0.0
+scipy>=1.11.0        # mask dilation (pipeline/crop.py) + feather blur (pipeline/composite.py)
 # ── Fast mode (LaMa) ──────────────────────────────────────────────────────
 simple-lama-inpainting>=0.1.2

scripts/mirror_checkpoints.py ADDED Viewed

	@@ -0,0 +1,138 @@

+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "huggingface_hub>=0.26",
+#   "requests>=2.31",
+# ]
+# ///
+"""
+mirror_checkpoints.py
+---------------------
+One-off mirror job: copies the three model dependencies for the
+Video Watermark Remover Space into JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints
+so the Space is insulated from upstream deletion.
+Sources mirrored:
+  1. Wan-AI/Wan2.1-VACE-14B-diffusers     (~75 GB, Apache-2.0) → vace-14b/
+  2. lightx2v/Wan2.1-Distill-Loras        (single LoRA file)   → loras/
+  3. big-lama.pt from GitHub releases     (~196 MB, Apache-2.0) → lama/
+Strategy
+--------
+Per-file streaming: download → upload → delete. Disk usage at any moment
+is ~one file (max ~5 GB for a single VACE transformer shard), so this fits
+on cpu-basic / cpu-upgrade Jobs without ever holding the full 75 GB locally.
+"""
+import os
+import sys
+from pathlib import Path
+import requests
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+# Destination repo that receives every mirrored file.
+DEST_REPO = "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints"
+# Hub token comes from the environment; a missing or empty value aborts the
+# script here, at module load, before any download starts.
+TOKEN = os.environ.get("HF_TOKEN")
+if not TOKEN:
+    sys.exit("HF_TOKEN env var not set; pass via `--secrets HF_TOKEN=...`")
+# Scratch directory for the file currently in flight; created eagerly.
+WORK = Path("/tmp/mirror")
+WORK.mkdir(parents=True, exist_ok=True)
+# Authenticated client used by the upload calls in the helpers below.
+api = HfApi(token=TOKEN)
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def stream_repo(
+    src_repo: str,
+    dest_prefix: str,
+    src_type: str = "model",
+    exclude_globs: list[str] | None = None,
+) -> None:
+    """Mirror every non-excluded file in src_repo under dest_prefix in DEST_REPO.
+
+    Files are processed one at a time: download into ``WORK``, upload to
+    ``DEST_REPO`` as ``{dest_prefix}/{fname}``, then delete the local copy.
+
+    Parameters
+    ----------
+    src_repo : str
+        Source repo id on the Hub.
+    dest_prefix : str
+        Folder inside ``DEST_REPO`` that receives the files.
+    src_type : str
+        Repo type of the source, passed through as ``repo_type``.
+    exclude_globs : list[str] | None
+        Patterns tested with ``Path(f).match``; a file matching any of them
+        is skipped. ``None`` means nothing is excluded.
+
+    Raises
+    ------
+    Whatever ``list_repo_files``, ``hf_hub_download`` or ``api.upload_file``
+    raise; the first failure stops the loop.
+    """
+    files = list_repo_files(src_repo, repo_type=src_type, token=TOKEN)
+    exclude = exclude_globs or []
+    files = [f for f in files if not any(Path(f).match(g) for g in exclude)]
+    print(f"\n=== {src_repo} → {dest_prefix}/ ({len(files)} files) ===", flush=True)
+    for i, fname in enumerate(files, 1):
+        print(f"  [{i:>3}/{len(files)}] {fname}", flush=True)
+        local = Path(
+            hf_hub_download(
+                repo_id=src_repo,
+                repo_type=src_type,
+                filename=fname,
+                local_dir=str(WORK),
+                token=TOKEN,
+            )
+        )
+        try:
+            api.upload_file(
+                path_or_fileobj=str(local),
+                path_in_repo=f"{dest_prefix}/{fname}",
+                repo_id=DEST_REPO,
+                repo_type="model",
+                commit_message=f"Mirror {src_repo}: {fname}",
+            )
+        finally:
+            # Delete even when the upload fails, so a failed shard does not
+            # stay on disk and break the "one file at a time" budget.
+            local.unlink(missing_ok=True)
+def stream_url(url: str, dest_path_in_repo: str, commit_message: str) -> None:
+    """Download a single file from an arbitrary URL, push to DEST_REPO, delete.
+
+    The body is streamed to ``WORK/<basename of dest_path_in_repo>`` in 1 MB
+    chunks, uploaded to ``DEST_REPO`` at ``dest_path_in_repo``, and the local
+    copy is removed afterwards — also when the download or upload fails.
+
+    Raises
+    ------
+    requests.HTTPError
+        From ``raise_for_status()`` on a non-success response.
+    Any other exception raised by the download, the file write or
+    ``api.upload_file`` propagates unchanged.
+    """
+    fname = Path(dest_path_in_repo).name
+    print(f"\n=== {url} → {dest_path_in_repo} ===", flush=True)
+    local = WORK / fname
+    try:
+        with requests.get(url, stream=True, timeout=300) as r:
+            r.raise_for_status()
+            with open(local, "wb") as fp:
+                for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MB chunks
+                    fp.write(chunk)
+        api.upload_file(
+            path_or_fileobj=str(local),
+            path_in_repo=dest_path_in_repo,
+            repo_id=DEST_REPO,
+            repo_type="model",
+            commit_message=commit_message,
+        )
+    finally:
+        # Covers partial downloads and failed uploads as well as success.
+        local.unlink(missing_ok=True)
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    """Run the three mirror jobs in order: VACE-14B, the distill LoRA, LaMa."""
+    # 1. VACE-14B goes first (largest transfer, while disk is freshest).
+    vace_skip = ["assets/*", ".gitattributes"]
+    stream_repo(
+        src_repo="Wan-AI/Wan2.1-VACE-14B-diffusers",
+        dest_prefix="vace-14b",
+        exclude_globs=vace_skip,
+    )
+    # 2. Distill LoRA. The exclusions drop docs, i2v variants and the
+    #    rank-32 / rank-128 files; the stated intent is to keep only the
+    #    rank-64 t2v 4-step LoRA.
+    #    NOTE(review): the original note said this "matches vace.py 8-step
+    #    plan" — 4-step vs 8-step looks inconsistent; confirm which is meant.
+    lora_skip = [
+        "*.md",
+        ".gitattributes",
+        "*i2v*",
+        "*rank32*",
+        "*rank128*",
+    ]
+    stream_repo(
+        src_repo="lightx2v/Wan2.1-Distill-Loras",
+        dest_prefix="loras",
+        exclude_globs=lora_skip,
+    )
+    # 3. LaMa weights come from a GitHub release rather than the Hub.
+    stream_url(
+        "https://github.com/enesmsahin/simple-lama-inpainting/releases/download/v0.1.0/big-lama.pt",
+        "lama/big-lama.pt",
+        "Mirror big-lama.pt from simple-lama-inpainting v0.1.0 GitHub release",
+    )
+    print("\n✅ All mirrors complete.")
+if __name__ == "__main__":
+    main()