Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28

Commit

d2502a6

1 Parent(s): b6786fa

Update utils/cv_processing.py

Browse files

Files changed (1) hide show

utils/cv_processing.py +109 -82

utils/cv_processing.py CHANGED Viewed

@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 """
 cv_processing.py · FIXED VERSION with proper SAM2 handling + MatAnyone stateful integration
 """
 from __future__ import annotations
@@ -28,41 +31,48 @@
 PROFESSIONAL_BACKGROUNDS = PROFESSIONAL_BACKGROUNDS_LOCAL
 # ----------------------------------------------------------------------------
-# Helpers
 # ----------------------------------------------------------------------------
 def _ensure_rgb(img: np.ndarray) -> np.ndarray:
     if img is None:
         return img
-    if img.ndim == 3 and img.shape[2] == 3:
-        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-    return img
-def _ensure_rgb01(frame_bgr: np.ndarray) -> np.ndarray:
     """
-    Convert BGR uint8 [H,W,3] to RGB float32 in [0,1].
-    Accepts a variety of layouts and coerces safely to HWC.
     """
-    if frame_bgr is None:
-        raise ValueError("frame_bgr is None")
-    x = frame_bgr
-    if x.ndim == 2:
-        x = np.stack([x, x, x], axis=-1)  # gray -> 3ch
-    # channels-first -> HWC
-    if x.ndim == 3 and x.shape[0] in (1, 3, 4) and x.shape[-1] not in (1, 3, 4):
-        x = np.transpose(x, (1, 2, 0))
-    if x.dtype != np.uint8:
-        x = np.clip(x, 0, 255).astype(np.uint8)
-    rgb = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
-    return (rgb.astype(np.float32) / 255.0).copy()
 def _to_mask01(m: np.ndarray) -> np.ndarray:
     if m is None:
         return None
-    if m.ndim == 3 and m.shape[2] in (1, 3):
         m = m[..., 0]
-    m = m.astype(np.float32)
-    if m.max() > 1.0:
-        m = m / 255.0
     return np.clip(m, 0.0, 1.0)
 def _mask_to_2d(mask: np.ndarray) -> np.ndarray:
@@ -71,29 +81,31 @@ def _mask_to_2d(mask: np.ndarray) -> np.ndarray:
     Handles HWC/CHW/B1HW/1HW/HW, etc.
     """
     m = np.asarray(mask)
-    # channels-first 1xHxW
     if m.ndim == 3 and m.shape[0] == 1 and (m.shape[1] > 1 and m.shape[2] > 1):
         m = m[0]
-    # channels-last HxWx1
     if m.ndim == 3 and m.shape[-1] == 1:
         m = m[..., 0]
-    # multi-channel -> take first channel
     if m.ndim == 3:
         m = m[..., 0] if m.shape[-1] in (1, 3, 4) else m[0]
-    # squeeze anything left
     m = np.squeeze(m)
     if m.ndim != 2:
         h = int(m.shape[-2]) if m.ndim >= 2 else 512
         w = int(m.shape[-1]) if m.ndim >= 2 else 512
         logger.warning(f"_mask_to_2d: unexpected shape {mask.shape}, creating neutral mask.")
         m = np.full((h, w), 0.5, dtype=np.float32)
-    # dtype/range
     if m.dtype == np.uint8:
         m = m.astype(np.float32) / 255.0
     elif m.dtype != np.float32:
         m = m.astype(np.float32)
-    m = np.clip(m, 0.0, 1.0)
-    return np.ascontiguousarray(m)
 def _feather(mask01: np.ndarray, k: int = 2) -> np.ndarray:
     if mask01.ndim == 3:
@@ -133,17 +145,18 @@ def create_professional_background(key_or_cfg: Any, width: int, height: int) ->
     return _vertical_gradient(dark, color, width, height)
 # ----------------------------------------------------------------------------
-# Improved Segmentation
 # ----------------------------------------------------------------------------
-def _simple_person_segmentation(frame_bgr: np.ndarray) -> np.ndarray:
-    """Basic fallback segmentation using color detection"""
-    h, w = frame_bgr.shape[:2]
-    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
     lower_skin = np.array([0, 20, 70], dtype=np.uint8)
     upper_skin = np.array([20, 255, 255], dtype=np.uint8)
     skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)
     lower_green = np.array([40, 40, 40], dtype=np.uint8)
     upper_green = np.array([80, 255, 255], dtype=np.uint8)
     green_mask = cv2.inRange(hsv, lower_green, upper_green)
@@ -171,65 +184,77 @@ def segment_person_hq(
     **_compat_kwargs,
 ) -> np.ndarray:
     """
-    High-quality person segmentation with proper SAM2 handling
     """
-    h, w = frame.shape[:2]
     if use_sam2 is False:
-        return _simple_person_segmentation(frame)
     if predictor is not None:
         try:
             if hasattr(predictor, "set_image") and hasattr(predictor, "predict"):
-                rgb = _ensure_rgb(frame)
-                predictor.set_image(rgb)
-                points = []
-                labels = []
-                points.append([w // 2, h // 2]); labels.append(1)
-                points.append([w // 2, h // 4]); labels.append(1)
-                points.append([w // 2, h // 2 + h // 8]); labels.append(1)
-                point_coords = np.array(points, dtype=np.float32)
-                point_labels = np.array(labels, dtype=np.int32)
                 result = predictor.predict(
-                    point_coords=point_coords,
-                    point_labels=point_labels,
                     multimask_output=True
                 )
                 if isinstance(result, dict):
                     masks = result.get("masks", None)
                     scores = result.get("scores", None)
-                elif isinstance(result, tuple) and len(result) >= 2:
                     masks, scores = result[0], result[1]
                 else:
-                    masks = result
-                    scores = None
                 if masks is not None:
-                    masks = np.array(masks)
-                    if masks.size > 0:
-                        if masks.ndim == 3 and masks.shape[0] > 0:
-                            if scores is not None and len(scores) > 0:
-                                best_idx = np.argmax(scores)
-                                mask = masks[best_idx]
-                            else:
-                                mask = masks[0]
-                        elif masks.ndim == 2:
-                            mask = masks
                         else:
-                            logger.warning(f"Unexpected mask shape from SAM2: {masks.shape}")
-                            mask = None
-                        if mask is not None:
-                            mask = _to_mask01(mask)
-                            if mask.max() > 0.1:
-                                return mask
-                            else:
-                                logger.warning("SAM2 mask too weak, using fallback")
                 else:
                     logger.warning("SAM2 returned no masks")
@@ -238,7 +263,7 @@ def segment_person_hq(
     if fallback_enabled:
         logger.debug("Using fallback segmentation")
-        return _simple_person_segmentation(frame)
     else:
         return np.ones((h, w), dtype=np.float32)
@@ -276,7 +301,7 @@ def refine_mask_hq(
     if matanyone is not None and callable(matanyone):
         try:
-            rgb01 = _ensure_rgb01(frame)
             # Stateful path (preferred)
             if frame_idx is not None:
@@ -285,7 +310,7 @@ def refine_mask_hq(
                 else:
                     refined = matanyone(rgb01)                 # propagate without mask
                 refined = _mask_to_2d(refined)
-                if refined.max() > 0.1:
                     return _postprocess_mask(refined)
                 logger.warning("MatAnyone stateful refinement produced empty/weak mask; falling back.")
@@ -315,7 +340,7 @@ def refine_mask_hq(
                 except Exception as e:
                     logger.debug(f"MatAnyone process failed: {e}")
-            if refined is not None and refined.max() > 0.1:
                 return _postprocess_mask(refined)
             else:
                 logger.warning("MatAnyone refinement failed or produced empty mask")
@@ -331,7 +356,7 @@ def refine_mask_hq(
 def _postprocess_mask(mask01: np.ndarray) -> np.ndarray:
     """Post-process mask to clean edges and remove artifacts"""
-    mask_uint8 = (mask01 * 255).astype(np.uint8)
     kernel_close = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel_close)
@@ -342,11 +367,12 @@ def _postprocess_mask(mask01: np.ndarray) -> np.ndarray:
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
-    return mask_uint8.astype(np.float32) / 255.0
 def _fallback_refine(mask01: np.ndarray) -> np.ndarray:
     """Simple fallback refinement"""
-    mask_uint8 = (mask01 * 255).astype(np.uint8)
     mask_uint8 = cv2.bilateralFilter(mask_uint8, 9, 75, 75)
@@ -356,10 +382,11 @@ def _fallback_refine(mask01: np.ndarray) -> np.ndarray:
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
-    return mask_uint8.astype(np.float32) / 255.0
 # ----------------------------------------------------------------------------
-# Compositing
 # ----------------------------------------------------------------------------
 def replace_background_hq(
     frame: np.ndarray,
@@ -368,7 +395,7 @@ def replace_background_hq(
     fallback_enabled: bool = True,
     **_compat,
 ) -> np.ndarray:
-    """High-quality background replacement with alpha blending"""
     try:
         H, W = frame.shape[:2]

 #!/usr/bin/env python3
 """
 cv_processing.py · FIXED VERSION with proper SAM2 handling + MatAnyone stateful integration
+All public functions in this module expect RGB images (H,W,3) unless stated otherwise.
+CoreVideoProcessor already converts BGR→RGB before calling into this module.
 """
 from __future__ import annotations
 PROFESSIONAL_BACKGROUNDS = PROFESSIONAL_BACKGROUNDS_LOCAL
 # ----------------------------------------------------------------------------
+# Helpers (RGB-safe)
 # ----------------------------------------------------------------------------
 def _ensure_rgb(img: np.ndarray) -> np.ndarray:
     """
     Coerce an image to channels-last (H, W, 3) layout without swapping colours.

     * ``None`` is passed through unchanged.
     * HWC input with 3 or 4 channels keeps its first three channels
       (a 4th channel is dropped).
     * Channels-first input with 1, 3 or 4 leading channels is transposed to HWC.
     * Single-channel input (HxW, HxWx1 or 1xHxW) is replicated to three
       identical channels so callers that need a 3-channel image
       (e.g. an RGB->HSV conversion) do not fail on grayscale frames.
     * Any other shape is returned as-is.

     DOES NOT perform BGR<->RGB swaps (the caller is responsible for color space).
     The dtype is never changed. The result may be a view of the input.
     """
     if img is None:
         return img
     x = np.asarray(img)
     if x.ndim == 2:
         # Plain grayscale: add a channel axis and replicate it.
         return np.repeat(x[..., np.newaxis], 3, axis=-1)
     if x.ndim != 3:
         return x
     if x.shape[-1] in (3, 4):
         # Already HWC; drop a 4th channel if present.
         return x[..., :3]
     if x.shape[-1] == 1:
         # HWC grayscale.
         return np.repeat(x, 3, axis=-1)
     if x.shape[0] in (1, 3, 4):
         # Channels-first: the last axis is not a plausible channel count here.
         hwc = np.transpose(x, (1, 2, 0))
         if hwc.shape[-1] == 1:
             return np.repeat(hwc, 3, axis=-1)
         return hwc[..., :3]
     return x
+def _ensure_rgb01(frame_rgb: np.ndarray) -> np.ndarray:
     """
+    Convert RGB uint8/float to RGB float32 in [0,1], HWC.
+    No channel swaps are performed.
     """
+    if frame_rgb is None:
+        raise ValueError("frame_rgb is None")
+    x = _ensure_rgb(frame_rgb)
+    if x.dtype == np.uint8:
+        return (x.astype(np.float32) / 255.0).copy()
+    if np.issubdtype(x.dtype, np.floating):
+        return np.clip(x.astype(np.float32), 0.0, 1.0).copy()
+    # other integer types
+    x = np.clip(x, 0, 255).astype(np.uint8)
+    return (x.astype(np.float32) / 255.0).copy()
 def _to_mask01(m: np.ndarray) -> np.ndarray:
     """
     Normalise a mask to float32 values in [0, 1].

     * ``None`` is passed through unchanged.
     * The input is coerced with ``np.asarray`` before any shape inspection, so
       lists and other array-likes are accepted.
     * A 3-D input whose last axis has 1, 3 or 4 entries is reduced to its
       first channel.
     * uint8 data is divided by 255; every other dtype is cast to float32
       without rescaling.
     * The result is clipped to [0, 1].
     """
     if m is None:
         return None
     # Coerce first: .ndim / .shape below require an ndarray.
     m = np.asarray(m)
     if m.ndim == 3 and m.shape[2] in (1, 3, 4):
         m = m[..., 0]
     if m.dtype == np.uint8:
         m = m.astype(np.float32) / 255.0
     elif m.dtype != np.float32:
         m = m.astype(np.float32)
     return np.clip(m, 0.0, 1.0)
 def _mask_to_2d(mask: np.ndarray) -> np.ndarray:
     Handles HWC/CHW/B1HW/1HW/HW, etc.
     """
     m = np.asarray(mask)
+    # CHW with single channel
     if m.ndim == 3 and m.shape[0] == 1 and (m.shape[1] > 1 and m.shape[2] > 1):
         m = m[0]
+    # HWC with single channel
     if m.ndim == 3 and m.shape[-1] == 1:
         m = m[..., 0]
+    # generic 3D -> take first channel
     if m.ndim == 3:
         m = m[..., 0] if m.shape[-1] in (1, 3, 4) else m[0]
     m = np.squeeze(m)
     if m.ndim != 2:
+        # fall back to neutral 0.5 mask
         h = int(m.shape[-2]) if m.ndim >= 2 else 512
         w = int(m.shape[-1]) if m.ndim >= 2 else 512
         logger.warning(f"_mask_to_2d: unexpected shape {mask.shape}, creating neutral mask.")
         m = np.full((h, w), 0.5, dtype=np.float32)
     if m.dtype == np.uint8:
         m = m.astype(np.float32) / 255.0
     elif m.dtype != np.float32:
         m = m.astype(np.float32)
+    return np.ascontiguousarray(np.clip(m, 0.0, 1.0))
 def _feather(mask01: np.ndarray, k: int = 2) -> np.ndarray:
     if mask01.ndim == 3:
     return _vertical_gradient(dark, color, width, height)
 # ----------------------------------------------------------------------------
+# Improved Segmentation (expects RGB input)
 # ----------------------------------------------------------------------------
+def _simple_person_segmentation(frame_rgb: np.ndarray) -> np.ndarray:
+    """Basic fallback segmentation using color detection on RGB frames."""
+    h, w = frame_rgb.shape[:2]
+    hsv = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2HSV)
     lower_skin = np.array([0, 20, 70], dtype=np.uint8)
     upper_skin = np.array([20, 255, 255], dtype=np.uint8)
     skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)
+    # detect greenscreen-ish
     lower_green = np.array([40, 40, 40], dtype=np.uint8)
     upper_green = np.array([80, 255, 255], dtype=np.uint8)
     green_mask = cv2.inRange(hsv, lower_green, upper_green)
     **_compat_kwargs,
 ) -> np.ndarray:
     """
+    High-quality person segmentation with proper SAM2 handling.
+    Expects RGB frame (H,W,3), uint8 or float in [0,1].
     """
+    frame_rgb = _ensure_rgb(frame)
+    h, w = frame_rgb.shape[:2]
     if use_sam2 is False:
+        return _simple_person_segmentation(frame_rgb)
     if predictor is not None:
         try:
             if hasattr(predictor, "set_image") and hasattr(predictor, "predict"):
+                # Predictor adapter expects RGB uint8; convert if needed
+                if frame_rgb.dtype != np.uint8:
+                    rgb_u8 = np.clip(frame_rgb * (255.0 if frame_rgb.dtype != np.uint8 else 1.0), 0, 255).astype(np.uint8) \
+                             if np.issubdtype(frame_rgb.dtype, np.floating) else frame_rgb.astype(np.uint8)
+                else:
+                    rgb_u8 = frame_rgb
+                predictor.set_image(rgb_u8)
+                # Center + a couple of body-biased prompts
+                points = np.array([
+                    [w // 2, h // 2],
+                    [w // 2, h // 4],
+                    [w // 2, h // 2 + h // 8],
+                ], dtype=np.float32)
+                labels = np.array([1, 1, 1], dtype=np.int32)
                 result = predictor.predict(
+                    point_coords=points,
+                    point_labels=labels,
                     multimask_output=True
                 )
+                # normalize outputs
                 if isinstance(result, dict):
                     masks = result.get("masks", None)
                     scores = result.get("scores", None)
+                elif isinstance(result, (tuple, list)) and len(result) >= 2:
                     masks, scores = result[0], result[1]
                 else:
+                    masks, scores = result, None
                 if masks is not None:
+                    masks = np.asarray(masks)
+                    if masks.ndim == 2:
+                        mask = masks
+                    elif masks.ndim == 3 and masks.shape[0] > 0:
+                        if scores is not None:
+                            best_idx = int(np.argmax(np.asarray(scores)))
+                            mask = masks[best_idx]
+                        else:
+                            mask = masks[0]
+                    elif masks.ndim == 4 and masks.shape[1] == 1:
+                        # (N,1,H,W)
+                        if scores is not None:
+                            best_idx = int(np.argmax(np.asarray(scores)))
+                            mask = masks[best_idx, 0]
+                        else:
+                            mask = masks[0, 0]
+                    else:
+                        logger.warning(f"Unexpected mask shape from SAM2: {masks.shape}")
+                        mask = None
+                    if mask is not None:
+                        mask = _to_mask01(mask)
+                        if float(mask.max()) > 0.1:
+                            return np.ascontiguousarray(mask)
                         else:
+                            logger.warning("SAM2 mask too weak, using fallback")
                 else:
                     logger.warning("SAM2 returned no masks")
     if fallback_enabled:
         logger.debug("Using fallback segmentation")
+        return _simple_person_segmentation(frame_rgb)
     else:
         return np.ones((h, w), dtype=np.float32)
     if matanyone is not None and callable(matanyone):
         try:
+            rgb01 = _ensure_rgb01(frame)  # RGB float32 in [0,1]
             # Stateful path (preferred)
             if frame_idx is not None:
                 else:
                     refined = matanyone(rgb01)                 # propagate without mask
                 refined = _mask_to_2d(refined)
+                if float(refined.max()) > 0.1:
                     return _postprocess_mask(refined)
                 logger.warning("MatAnyone stateful refinement produced empty/weak mask; falling back.")
                 except Exception as e:
                     logger.debug(f"MatAnyone process failed: {e}")
+            if refined is not None and float(refined.max()) > 0.1:
                 return _postprocess_mask(refined)
             else:
                 logger.warning("MatAnyone refinement failed or produced empty mask")
 def _postprocess_mask(mask01: np.ndarray) -> np.ndarray:
     """Post-process mask to clean edges and remove artifacts"""
+    mask_uint8 = (np.clip(mask01, 0, 1) * 255).astype(np.uint8)
     kernel_close = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel_close)
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
+    out = mask_uint8.astype(np.float32) / 255.0
+    return np.ascontiguousarray(out)
 def _fallback_refine(mask01: np.ndarray) -> np.ndarray:
     """Simple fallback refinement"""
+    mask_uint8 = (np.clip(mask01, 0, 1) * 255).astype(np.uint8)
     mask_uint8 = cv2.bilateralFilter(mask_uint8, 9, 75, 75)
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
+    out = mask_uint8.astype(np.float32) / 255.0
+    return np.ascontiguousarray(out)
 # ----------------------------------------------------------------------------
+# Compositing (expects RGB inputs)
 # ----------------------------------------------------------------------------
 def replace_background_hq(
     frame: np.ndarray,
     fallback_enabled: bool = True,
     **_compat,
 ) -> np.ndarray:
+    """High-quality background replacement with alpha blending (RGB in/out)."""
     try:
         H, W = frame.shape[:2]