Create utils/mask_bridge.py
Browse files- utils/mask_bridge.py +60 -0
utils/mask_bridge.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
mask_bridge.py - SAM2 to MatAnyone mask conversion
|
| 4 |
+
Handles shape/dtype/device normalization between models
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import math
|
| 9 |
+
from typing import Optional, Tuple
|
| 10 |
+
|
| 11 |
+
def log_shape(tag: str, t: torch.Tensor) -> None:
|
| 12 |
+
"""Debug logging for tensor shapes and values"""
|
| 13 |
+
mn = float(t.min()) if t.numel() else math.nan
|
| 14 |
+
mx = float(t.max()) if t.numel() else math.nan
|
| 15 |
+
print(f"{tag}: shape={tuple(t.shape)} dtype={t.dtype} device={t.device} range=[{mn:.3f},{mx:.3f}]")
|
| 16 |
+
|
| 17 |
+
def sam2_to_matanyone_mask(
|
| 18 |
+
sam2_masks: torch.Tensor, # shape: (B, M, H, W) from SAM2 post_process_masks
|
| 19 |
+
iou_scores: Optional[torch.Tensor] = None, # optional, (B, M)
|
| 20 |
+
threshold: float = 0.5, # binarization for hard mask
|
| 21 |
+
return_mode: str = "single", # "single" → (1,H,W); "multi" → (C,H,W)
|
| 22 |
+
keep_soft: bool = False, # if True → soft [0,1] alpha channel
|
| 23 |
+
) -> torch.Tensor:
|
| 24 |
+
"""
|
| 25 |
+
Convert SAM2 output masks to MatAnyone-ready format.
|
| 26 |
+
|
| 27 |
+
Returns a MatAnyone-ready tensor on the same device:
|
| 28 |
+
- "single": (1,H,W) float32 in [0,1]
|
| 29 |
+
- "multi": (C,H,W) float32 in [0,1]
|
| 30 |
+
"""
|
| 31 |
+
assert sam2_masks.ndim == 4, f"Expect (B,M,H,W). Got {tuple(sam2_masks.shape)}"
|
| 32 |
+
B, M, H, W = sam2_masks.shape
|
| 33 |
+
assert B == 1, "We pass one frame to build first-frame mask."
|
| 34 |
+
|
| 35 |
+
masks = sam2_masks[0] # (M,H,W)
|
| 36 |
+
|
| 37 |
+
# Choose best mask
|
| 38 |
+
if iou_scores is not None and iou_scores.ndim == 2:
|
| 39 |
+
best_idx = int(torch.argmax(iou_scores[0]).item())
|
| 40 |
+
else:
|
| 41 |
+
# Fallback: pick the mask with largest foreground area
|
| 42 |
+
areas = masks.sum(dim=(1,2))
|
| 43 |
+
best_idx = int(torch.argmax(areas).item())
|
| 44 |
+
|
| 45 |
+
if return_mode == "multi":
|
| 46 |
+
out = masks
|
| 47 |
+
else:
|
| 48 |
+
out = masks[best_idx:best_idx+1] # (1,H,W)
|
| 49 |
+
|
| 50 |
+
# Ensure float32 [0,1]
|
| 51 |
+
out = out.to(dtype=torch.float32)
|
| 52 |
+
if not keep_soft:
|
| 53 |
+
out = (out >= threshold).float()
|
| 54 |
+
|
| 55 |
+
# Final sanity: contiguous, shapes
|
| 56 |
+
out = out.contiguous()
|
| 57 |
+
assert out.ndim == 3, f"Expect (C,H,W); got {tuple(out.shape)}"
|
| 58 |
+
assert out.shape[0] >= 1, f"Need at least 1 channel; got {out.shape[0]}"
|
| 59 |
+
|
| 60 |
+
return out # (C,H,W) float32 [0,1]
|