Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 26

Commit

2586f05

1 Parent(s): a8a12b2

Create utils/mask_bridge.py

Browse files

Files changed (1) hide show

utils/mask_bridge.py +60 -0

utils/mask_bridge.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python3
+"""
+mask_bridge.py - SAM2 to MatAnyone mask conversion
+Handles shape/dtype/device normalization between models
+"""
+import torch
+import math
+from typing import Optional, Tuple
def log_shape(tag: str, t: torch.Tensor) -> None:
    """Print a one-line debug summary of a tensor.

    The printed line contains ``tag`` followed by the tensor's shape, dtype,
    device and its ``[min,max]`` value range formatted to three decimals.

    The range is reported as NaN when it cannot be computed: for an empty
    tensor, or when ``min``/``max`` raise for the tensor's dtype (e.g. complex
    values, which have no ordering). This is a debugging aid, so it must not
    abort the code path it is observing.

    Args:
        tag: Label printed in front of the summary.
        t: Tensor to describe.
    """
    mn = mx = math.nan
    if t.numel():
        try:
            mn, mx = float(t.min()), float(t.max())
        except (RuntimeError, TypeError):
            # min/max are not available for this dtype; keep the NaN
            # placeholders instead of crashing the caller.
            pass
    print(f"{tag}: shape={tuple(t.shape)} dtype={t.dtype} device={t.device} range=[{mn:.3f},{mx:.3f}]")
def sam2_to_matanyone_mask(
    sam2_masks: torch.Tensor,         # shape: (B, M, H, W) from SAM2 post_process_masks
    iou_scores: Optional[torch.Tensor] = None,  # optional, (B, M)
    threshold: float = 0.5,           # binarization for hard mask
    return_mode: str = "single",      # "single" → (1,H,W); "multi" → (C,H,W)
    keep_soft: bool = False,          # if True → soft [0,1] alpha channel
) -> torch.Tensor:
    """Convert SAM2 output masks to a MatAnyone-ready tensor.

    Only a single frame (``B == 1``) is accepted. The result is a contiguous
    float32 tensor with values in [0, 1]; no explicit device transfer is
    performed.

    Args:
        sam2_masks: Mask tensor of shape (B, M, H, W) with B == 1.
        iou_scores: Optional per-mask scores. Used to choose the best mask
            only when it is 2-D; row 0 is consulted. Any other value falls
            back to the area heuristic.
        threshold: Cut-off used to binarize the masks. It also defines which
            pixels count as foreground in the area heuristic.
        return_mode: ``"multi"`` returns all M masks as (M, H, W). Any other
            value is treated as ``"single"`` and returns the best mask as
            (1, H, W).
        keep_soft: If True, values are kept soft and clamped to [0, 1]. If
            False, the output is hard 0/1 via ``>= threshold``.

    Returns:
        A (C, H, W) float32 tensor in [0, 1], with C == 1 in single mode and
        C == M in multi mode.

    Raises:
        AssertionError: If the input is not 4-D, if B != 1, or if the result
            would have no channels.
    """
    assert sam2_masks.ndim == 4, f"Expect (B,M,H,W). Got {tuple(sam2_masks.shape)}"
    assert sam2_masks.shape[0] == 1, "We pass one frame to build first-frame mask."
    masks = sam2_masks[0]  # (M,H,W)

    if return_mode == "multi":
        # All masks are returned, so no best-mask selection (and no
        # .item() host sync) is needed.
        out = masks
    else:
        if iou_scores is not None and iou_scores.ndim == 2:
            best_idx = int(torch.argmax(iou_scores[0]).item())
        else:
            # Fallback: pick the mask with the largest foreground area.
            # Count pixels that survive the threshold instead of summing raw
            # values, so a large faint soft mask cannot outrank a real
            # foreground mask. Identical for already-binary masks.
            areas = (masks >= threshold).sum(dim=(1, 2))
            best_idx = int(torch.argmax(areas).item())
        out = masks[best_idx:best_idx + 1]  # (1,H,W)

    # Ensure float32 [0,1]
    out = out.to(dtype=torch.float32)
    if keep_soft:
        # Enforce the documented [0,1] range for soft alpha output.
        out = out.clamp(0.0, 1.0)
    else:
        out = (out >= threshold).float()

    # Final sanity: contiguous, shapes
    out = out.contiguous()
    assert out.ndim == 3, f"Expect (C,H,W); got {tuple(out.shape)}"
    assert out.shape[0] >= 1, f"Need at least 1 channel; got {out.shape[0]}"
    return out  # (C,H,W) float32 [0,1]