MogensR committed on
Commit 31fec2e · 1 Parent(s): 4055558

Update processing/video/video_processor.py

Files changed (1)
  1. processing/video/video_processor.py +809 -0
processing/video/video_processor.py CHANGED
@@ -60,9 +60,16 @@ def _to_rgb(c):
     return (255, 255, 255)
 
 def _create_gradient_background_local(spec: Dict[str, Any], width: int, height: int) -> np.ndarray:
+    """
+    Minimal gradient generator for backgrounds (linear with rotation).
+    spec = {"type": "linear"|"radial" (ignored), "start": (r,g,b)|"#rrggbb", "end": ..., "angle_deg": float}
+    Returns RGB np.uint8 (H, W, 3).
+    """
     start = _to_rgb(spec.get("start", "#222222"))
     end = _to_rgb(spec.get("end", "#888888"))
     angle = float(spec.get("angle_deg", 0))
+
+    # build vertical gradient
     bg = np.zeros((height, width, 3), np.uint8)
     for y in range(height):
         t = y / max(1, height - 1)
@@ -70,6 +77,808 @@ def _create_gradient_background_local(spec: Dict[str, Any], width: int, height:
         g = int(start[1]*(1-t) + end[1]*t)
         b = int(start[2]*(1-t) + end[2]*t)
         bg[y, :] = (r, g, b)
+
     if abs(angle) % 360 < 1e-6:
         return bg
+
+    # rotate by angle using OpenCV (RGB-safe)
     center = (width / 2, height / 2)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    rot = cv2.warpAffine(bg, M, (width, height), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT_101)
+    return rot
+
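For orientation, a minimal sketch of exercising this helper on its own (assumes only the module's `cv2`/`numpy` imports; the output file name is illustrative):

```python
# Hypothetical smoke test for the gradient helper.
spec = {"type": "linear", "start": (20, 20, 40), "end": (90, 90, 140), "angle_deg": 30.0}
bg_rgb = _create_gradient_background_local(spec, width=1280, height=720)

# The helper returns RGB; OpenCV expects BGR when writing to disk.
cv2.imwrite("gradient_preview.png", cv2.cvtColor(bg_rgb, cv2.COLOR_RGB2BGR))
```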
+
+@dataclass
+class ProcessorConfig:
+    background_preset: str = "office"  # key in PROFESSIONAL_BACKGROUNDS
+    write_fps: Optional[float] = None  # None -> keep source fps
+
+    # Model-only downscale (speedup without changing output resolution)
+    max_model_size: Optional[int] = 1280
+
+    # FFmpeg / NVENC output (pipe). If disabled or unavailable, use OpenCV writer.
+    use_nvenc: bool = True
+    nvenc_codec: str = "h264"       # "h264" or "hevc"
+    nvenc_preset: str = "p5"        # NVENC preset string
+    nvenc_cq: int = 18              # constant quality (lower = higher quality)
+    nvenc_tune_hq: bool = True
+    nvenc_pix_fmt: str = "yuv420p"  # browser-safe
+
+    # libx264 fallback
+    x264_preset: str = "medium"
+    x264_crf: int = 18
+    x264_pix_fmt: str = "yuv420p"
+
+    movflags_faststart: bool = True
+
+    # ---------- stability & edge quality ----------
+    temporal_ema_alpha: float = 0.75  # higher = calmer (0.6–0.85 typical)
+    min_iou_to_accept: float = 0.05   # reject sudden mask jumps
+    dilate_px: int = 6                # pad edges to keep hair/ears/shoulders
+    edge_blur_px: int = 1             # tiny blur to calm edge shimmer
+
+    # hardening (turn soft mask into crisper 0/1)
+    hard_low: float = 0.35    # values below -> 0
+    hard_high: float = 0.70   # values above -> 1
+    mask_gamma: float = 0.90  # <1 boosts mid-tones slightly
+
+    # ---------- windowed two-phase control ----------
+    use_windowed: bool = True  # enable two-phase SAM2→MatAnyone per chunk
+    window_size: int = 8       # frames per window
+
+
+# Back-compat name used elsewhere in the app (module level: the name
+# ProcessorConfig is not bound until the class body above has executed)
+ProcessingConfig = ProcessorConfig
+
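Since these are plain dataclass fields, callers can override a few knobs and keep the rest at their defaults; a sketch for a CPU-only box (values illustrative, field names as defined above):

```python
cfg = ProcessorConfig(
    use_nvenc=False,          # skip NVENC, go straight to libx264
    x264_preset="slow",
    x264_crf=17,
    temporal_ema_alpha=0.8,   # within the 0.6-0.85 range noted above
    max_model_size=960,       # smaller model input, same output resolution
)
```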
+
+class _FFmpegPipe:
+    """
+    Wrapper around an FFmpeg stdin pipe with encoder fallbacks and good error messages.
+    """
+
+    def __init__(self, width: int, height: int, fps: float, out_path: str, cfg: ProcessorConfig, log=_log):
+        self.width = int(width)
+        self.height = int(height)
+        self.fps = float(fps) if fps and fps > 0 else 25.0
+        self.out_path = out_path
+        self.cfg = cfg
+        self.log = log
+
+        self.proc: Optional[subprocess.Popen] = None
+        self.encoder_used: Optional[str] = None
+        self._stderr: Optional[bytes] = None
+
+        self._ffmpeg = shutil.which("ffmpeg")
+        if not self._ffmpeg:
+            raise RuntimeError("ffmpeg not found on PATH")
+
+        self._start_with_fallbacks()
+
+    def _cmd_for_encoder(self, encoder: str) -> List[str]:
+        base = [
+            self._ffmpeg,
+            "-hide_banner", "-loglevel", "error",
+            "-y",
+            # rawvideo input from stdin
+            "-f", "rawvideo",
+            "-vcodec", "rawvideo",
+            "-pix_fmt", "bgr24",
+            "-s", f"{self.width}x{self.height}",
+            "-r", f"{self.fps}",
+            "-i", "-",  # stdin
+            "-an",      # no audio here
+        ]
+        if self.cfg.movflags_faststart:
+            base += ["-movflags", "+faststart"]
+
+        if encoder == "h264_nvenc":
+            base += [
+                "-c:v", "h264_nvenc",
+                "-preset", self.cfg.nvenc_preset,
+                "-cq", str(int(self.cfg.nvenc_cq)),
+                "-pix_fmt", self.cfg.nvenc_pix_fmt,
+            ]
+            if self.cfg.nvenc_tune_hq:
+                base += ["-tune", "hq"]
+        elif encoder == "hevc_nvenc":
+            base += [
+                "-c:v", "hevc_nvenc",
+                "-preset", self.cfg.nvenc_preset,
+                "-cq", str(int(self.cfg.nvenc_cq)),
+                "-pix_fmt", self.cfg.nvenc_pix_fmt,
+            ]
+            if self.cfg.nvenc_tune_hq:
+                base += ["-tune", "hq"]
+        elif encoder == "libx264":
+            base += [
+                "-c:v", "libx264",
+                "-preset", self.cfg.x264_preset,
+                "-crf", str(int(self.cfg.x264_crf)),
+                "-pix_fmt", self.cfg.x264_pix_fmt,
+            ]
+        elif encoder == "mpeg4":
+            base += [
+                "-c:v", "mpeg4",
+                "-q:v", "2",
+                "-pix_fmt", "yuv420p",
+            ]
+        else:
+            base += ["-c:v", "libx264", "-preset", self.cfg.x264_preset, "-crf", str(int(self.cfg.x264_crf)), "-pix_fmt", self.cfg.x264_pix_fmt]
+
+        base += [self.out_path]
+        return base
+
+    def _try_start(self, enc: str) -> bool:
+        cmd = self._cmd_for_encoder(enc)
+        try:
+            self.proc = subprocess.Popen(
+                cmd,
+                stdin=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                bufsize=10**7,
+            )
+            self.encoder_used = enc
+            self.log.info("FFmpeg started: %s", " ".join(shlex.quote(c) for c in cmd))
+            # quick poll: if ffmpeg dies immediately, fail fast
+            time.sleep(0.05)
+            if self.proc.poll() is not None:
+                self._stderr = self.proc.stderr.read() if self.proc.stderr else b""
+                self.log.warning("FFmpeg exited on start with %s: %s", enc, (self._stderr or b"").decode(errors="ignore"))
+                self.proc = None
+                return False
+            return True
+        except Exception as e:
+            self.log.warning("Failed to start FFmpeg with %s: %s", enc, e)
+            self.proc = None
+            return False
+
+    def _start_with_fallbacks(self):
+        encoders = []
+        if self.cfg.use_nvenc:
+            encoders += ["h264_nvenc"] if self.cfg.nvenc_codec.lower() == "h264" else ["hevc_nvenc"]
+        encoders += ["libx264", "mpeg4"]
+        for enc in encoders:
+            if self._try_start(enc):
+                return
+        msg = "Could not start FFmpeg with any encoder (nvenc/libx264/mpeg4). Is ffmpeg present and codecs available?"
+        if self._stderr:
+            msg += f" Stderr: {(self._stderr or b'').decode(errors='ignore')[:500]}"
+        raise RuntimeError(msg)
+
+    def write(self, frame_bgr: np.ndarray):
+        if self.proc is None or self.proc.stdin is None:
+            raise RuntimeError("FFmpeg process is not running (stdin is None).")
+        if not isinstance(frame_bgr, np.ndarray) or frame_bgr.dtype != np.uint8:
+            raise ValueError("Frame must be a np.ndarray of dtype uint8.")
+        if frame_bgr.ndim != 3 or frame_bgr.shape[2] != 3:
+            raise ValueError("Frame must have shape (H, W, 3).")
+        if frame_bgr.shape[0] != self.height or frame_bgr.shape[1] != self.width:
+            raise ValueError(f"Frame size mismatch. Expected {self.width}x{self.height}, got {frame_bgr.shape[1]}x{frame_bgr.shape[0]}.")
+
+        # ensure contiguous for tobytes()
+        frame_bgr = np.ascontiguousarray(frame_bgr)
+        try:
+            self.proc.stdin.write(frame_bgr.tobytes())
+        except Exception as e:
+            # collect stderr for diagnostics
+            stderr = b""
+            try:
+                if self.proc and self.proc.stderr:
+                    stderr = self.proc.stderr.read()
+            except Exception:
+                pass
+            msg = f"FFmpeg pipe write failed: {e}"
+            if stderr:
+                msg += f"\nffmpeg stderr: {(stderr or b'').decode(errors='ignore')[:1000]}"
+            raise BrokenPipeError(msg)
+
+    def close(self):
+        if self.proc is None:
+            return
+        try:
+            if self.proc.stdin:
+                try:
+                    self.proc.stdin.flush()
+                except Exception:
+                    pass
+                try:
+                    self.proc.stdin.close()
+                except Exception:
+                    pass
+            # drain a bit of stderr for logs
+            if self.proc.stderr:
+                try:
+                    err = self.proc.stderr.read()
+                    if err:
+                        self.log.debug("FFmpeg stderr (tail): %s", err.decode(errors="ignore")[-2000:])
+                except Exception:
+                    pass
+            self.proc.wait(timeout=10)
+        except Exception:
+            try:
+                self.proc.kill()
+            except Exception:
+                pass
+        finally:
+            self.proc = None
+
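A minimal sketch of driving the pipe directly (assumes ffmpeg is on PATH; frames must be uint8 BGR at the declared size, per the checks in write(); output file name illustrative):

```python
import numpy as np

cfg = ProcessorConfig(use_nvenc=False)  # force the libx264/mpeg4 fallback chain
pipe = _FFmpegPipe(640, 360, 25.0, "black.mp4", cfg)
try:
    frame = np.zeros((360, 640, 3), np.uint8)  # one second of black BGR frames
    for _ in range(25):
        pipe.write(frame)
finally:
    pipe.close()  # closes stdin and waits for ffmpeg to finalize the file
```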
+
+class CoreVideoProcessor:
+    """
+    Minimal, safe implementation used by core/app.py.
+    It relies on a models provider (e.g., ModelLoader) that implements:
+      - get_sam2()
+      - get_matanyone()
+    and uses utils.cv_processing for the pipeline.
+
+    Supports progress callback and cancellation via stop_event.
+    """
+
+    def __init__(self, config: Optional[ProcessorConfig] = None, models: Optional[Any] = None):
+        self.log = _log
+        self.config = config or ProcessorConfig()
+        self.models = models  # do NOT load here; core/app handles loading
+        if self.models is None:
+            self.log.warning("CoreVideoProcessor initialized without a models provider; will use fallbacks.")
+        self._ffmpeg = shutil.which("ffmpeg")
+
+        # state for temporal smoothing
+        self._prev_mask: Optional[np.ndarray] = None
+
+        # --- ENV overrides (tunable without code change) ---
+        try:
+            if "MATANYONE_WINDOWED" in os.environ:
+                self.config.use_windowed = os.environ["MATANYONE_WINDOWED"].strip().lower() not in ("0", "false", "no")
+            if "MATANYONE_WINDOW" in os.environ:
+                self.config.window_size = max(1, int(os.environ["MATANYONE_WINDOW"]))
+            if "MAX_MODEL_SIZE" in os.environ:
+                self.config.max_model_size = max(0, int(os.environ["MAX_MODEL_SIZE"]))
+        except Exception:
+            pass
+
+        # Legacy per-frame stateful chunking (used only if use_windowed=False)
+        try:
+            self._chunk_size = max(1, int(os.environ.get("MATANYONE_CHUNK", "12")))
+        except Exception:
+            self._chunk_size = 12
+        self._chunk_idx = 0
+
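The overrides above make the windowing tunable without touching code; a sketch of setting them before construction (values illustrative, variable names as read in `__init__`):

```python
import os

os.environ["MATANYONE_WINDOWED"] = "1"  # anything except "0"/"false"/"no" enables it
os.environ["MATANYONE_WINDOW"] = "16"   # frames per window
os.environ["MAX_MODEL_SIZE"] = "960"    # cap on the model-side resolution

proc = CoreVideoProcessor()             # reads the overrides at construction
```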
+    # ---------- mask post-processing (stability + crispness) ----------
+    def _iou(self, a: np.ndarray, b: np.ndarray, thr: float = 0.5) -> float:
+        a_bin = (a >= thr).astype(np.uint8)
+        b_bin = (b >= thr).astype(np.uint8)
+        inter = np.count_nonzero(cv2.bitwise_and(a_bin, b_bin))
+        union = np.count_nonzero(cv2.bitwise_or(a_bin, b_bin))
+        return (inter / union) if union else 0.0
+
+    def _harden(self, m: np.ndarray) -> np.ndarray:
+        # optional gamma
+        g = float(self.config.mask_gamma)
+        if abs(g - 1.0) > 1e-6:
+            m = np.clip(m, 0, 1) ** g
+
+        lo = float(self.config.hard_low)
+        hi = float(self.config.hard_high)
+        if hi > lo + 1e-6:
+            m = (m - lo) / (hi - lo)
+            m = np.clip(m, 0.0, 1.0)
+
+        # pad edges then tiny blur
+        k = int(self.config.dilate_px)
+        if k > 0:
+            se = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*k+1, 2*k+1))
+            m = cv2.dilate(m, se, iterations=1)
+
+        eb = int(self.config.edge_blur_px)
+        if eb > 0:
+            m = cv2.GaussianBlur(m, (2*eb+1, 2*eb+1), 0)
+
+        return np.clip(m, 0.0, 1.0)
+
+    def _stabilize(self, m: np.ndarray) -> np.ndarray:
+        if self._prev_mask is None:
+            self._prev_mask = m
+            return m
+
+        # outlier rejection
+        if self._iou(self._prev_mask, m, 0.5) < float(self.config.min_iou_to_accept):
+            # ignore this frame's mask → keep previous
+            return self._prev_mask
+
+        # EMA
+        a = float(self.config.temporal_ema_alpha)
+        m_ema = a * self._prev_mask + (1.0 - a) * m
+        self._prev_mask = m_ema
+        return m_ema
+
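To see the gating and smoothing in isolation: with the default temporal_ema_alpha of 0.75 a new mask contributes only 25% per frame, and a mask whose IoU against the previous one drops below min_iou_to_accept is discarded outright. A synthetic sketch (private methods called directly, purely for illustration):

```python
import numpy as np

proc = CoreVideoProcessor()  # defaults: alpha=0.75, min_iou=0.05

a = np.zeros((64, 64), np.float32); a[16:48, 16:48] = 1.0  # stable subject
b = np.zeros((64, 64), np.float32); b[18:50, 16:48] = 1.0  # small drift
c = np.zeros((64, 64), np.float32); c[0:4, 0:4] = 1.0      # bogus outlier

proc._stabilize(a)      # first frame: stored and returned as-is
m = proc._stabilize(b)  # EMA blend: 0.75 * a + 0.25 * b
m = proc._stabilize(c)  # IoU vs. previous ~0 < 0.05 -> previous mask kept
```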
+    # ---------- Single frame (fallback path) ----------
+    def process_frame(self, frame_bgr: np.ndarray, background_rgb: np.ndarray) -> Dict[str, Any]:
+        """
+        Process one frame (legacy per-frame path):
+          - optionally downscale for model work,
+          - segment + refine,
+          - temporally stabilize + harden,
+          - upsample the mask,
+          - composite at full resolution.
+        Returns a dict with the composited frame (BGR, for the writer) and the mask (H, W float).
+        """
+        H, W = frame_bgr.shape[:2]
+        max_side = max(H, W)
+        scale = 1.0
+        proc_frame_bgr = frame_bgr
+
+        # Model-only downscale
+        if self.config.max_model_size and max_side > self.config.max_model_size:
+            scale = self.config.max_model_size / float(max_side)
+            newW = int(round(W * scale))
+            newH = int(round(H * scale))
+            proc_frame_bgr = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
+            self.log.debug(f"Model-only downscale: {W}x{H} -> {newW}x{newH} (scale={scale:.3f})")
+
+        # RGB for models
+        proc_frame_rgb = cv2.cvtColor(proc_frame_bgr, cv2.COLOR_BGR2RGB)
+
+        predictor = None
+        try:
+            if self.models and hasattr(self.models, "get_sam2"):
+                predictor = self.models.get_sam2()
+        except Exception as e:
+            self.log.warning(f"SAM2 predictor unavailable: {e}")
+
+        # 1) segmentation (with internal fallbacks)
+        mask_small = segment_person_hq(proc_frame_rgb, predictor, use_sam2=True)
+
+        # 2) refinement (MatAnyOne if available) — stateful chunking
+        matanyone = None
+        try:
+            if self.models and hasattr(self.models, "get_matanyone"):
+                matanyone = self.models.get_matanyone()
+        except Exception as e:
+            self.log.warning(f"MatAnyOne unavailable: {e}")
+
+        if matanyone is not None and hasattr(matanyone, "reset") and self._chunk_idx == 0:
+            try:
+                matanyone.reset()
+            except Exception:
+                pass
+
+        # IMPORTANT: the call order is (frame, mask, matanyone=...)
+        mask_small_ref = refine_mask_hq(
+            proc_frame_rgb,
+            mask_small,
+            matanyone=matanyone,
+            use_matanyone=True,
+            frame_idx=self._chunk_idx,  # enable stateful first-frame + propagate
+        )
+
+        # advance chunk + optional defrag
+        self._chunk_idx = (self._chunk_idx + 1) % max(1, self._chunk_size)
+        if self._chunk_idx == 0:
+            try:
+                import torch
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception:
+                pass
+
+        # Stabilize + harden at model scale
+        mask_small_ref = np.clip(mask_small_ref.astype(np.float32), 0.0, 1.0)
+        mask_stable = self._stabilize(mask_small_ref)
+        mask_stable = self._harden(mask_stable)
+
+        # Upsample the mask back to full resolution
+        if scale != 1.0:
+            mask_full = cv2.resize(mask_stable, (W, H), interpolation=cv2.INTER_LINEAR)
+        else:
+            mask_full = mask_stable
+
+        # 3) compositing (helpers expect RGB inputs; return RGB)
+        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+        out_rgb = replace_background_hq(frame_rgb, mask_full, background_rgb)
+
+        # Convert to BGR for the writer
+        out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
+        return {"frame": out_bgr, "mask": mask_full}
+
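A sketch of this per-frame fallback path on a single still image (file names hypothetical; with no models provider, segment_person_hq and refine_mask_hq fall back internally, as the comments above indicate):

```python
import cv2

proc = CoreVideoProcessor()             # warns: no models provider
frame_bgr = cv2.imread("speaker.jpg")   # hypothetical input, BGR as loaded
h, w = frame_bgr.shape[:2]
bg_rgb = _create_gradient_background_local(
    {"start": (20, 20, 40), "end": (90, 90, 140)}, w, h)
result = proc.process_frame(frame_bgr, bg_rgb)
cv2.imwrite("speaker_composited.png", result["frame"])  # already BGR
```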
+    # ---------- Build background once per video ----------
+    def _prepare_background_from_config(
+        self,
+        bg_config: Optional[Dict[str, Any]],
+        width: int,
+        height: int
+    ) -> np.ndarray:
+        """
+        Accepts any of:
+          - {"custom_path": "/path/to/image.png"}       → load image (RGB out)
+          - {"background_choice": "office"}             → preset
+          - {"gradient": {type, start, end, angle_deg}} → generated gradient
+        Returns RGB np.uint8.
+        """
+        # 1) custom image?
+        if bg_config and bg_config.get("custom_path"):
+            path = bg_config["custom_path"]
+            img_bgr = cv2.imread(path, cv2.IMREAD_COLOR)
+            if img_bgr is None:
+                self.log.warning(f"Custom background at '{path}' could not be read. Falling back to preset.")
+            else:
+                img_bgr = cv2.resize(img_bgr, (width, height), interpolation=cv2.INTER_LANCZOS4)
+                return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+
+        # 2) gradient?
+        if bg_config and isinstance(bg_config.get("gradient"), dict):
+            try:
+                return _create_gradient_background_local(bg_config["gradient"], width, height)
+            except Exception as e:
+                self.log.warning(f"Gradient generation failed: {e}. Falling back to preset.")
+
+        # 3) preset (explicit choice or default)
+        choice = None
+        if bg_config and "background_choice" in bg_config:
+            choice = bg_config["background_choice"]
+        if not choice:
+            choice = self.config.background_preset
+
+        if choice not in PROFESSIONAL_BACKGROUNDS:
+            self.log.warning(f"Unknown background preset '{choice}'; using 'office'.")
+            choice = "office"
+
+        return create_professional_background(choice, width, height)  # RGB
+
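The three accepted bg_config shapes side by side (paths and keys illustrative; the gradient dict is the same spec consumed by _create_gradient_background_local):

```python
bg_custom   = {"custom_path": "/path/to/backdrop.png"}      # image file
bg_preset   = {"background_choice": "office"}               # preset key
bg_gradient = {"gradient": {"type": "linear",
                            "start": "#222222",
                            "end": "#888888",
                            "angle_deg": 45.0}}             # generated
```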
+    # ---------- Windowed two-phase helpers ----------
+    def _model_downscale(self, frame_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
+        """Apply model-only downscale; return (resized_bgr, scale)."""
+        H, W = frame_bgr.shape[:2]
+        max_side = max(H, W)
+        if self.config.max_model_size and max_side > self.config.max_model_size:
+            s = self.config.max_model_size / float(max_side)
+            newW = int(round(W * s))
+            newH = int(round(H * s))
+            small = cv2.resize(frame_bgr, (newW, newH), interpolation=cv2.INTER_AREA)
+            return small, s
+        return frame_bgr, 1.0
+
+    def _prepare_sam2_gpu(self, predictor):
+        """Best-effort: ensure SAM2 is on CUDA before the SAM2 phase."""
+        try:
+            import torch  # local import to avoid a hard dependency at import time
+            if predictor is None or not torch.cuda.is_available():
+                return
+            # Try common patterns
+            if hasattr(predictor, "to"):
+                try:
+                    predictor.to("cuda")  # type: ignore[attr-defined]
+                    return
+                except Exception:
+                    pass
+            if hasattr(predictor, "model") and hasattr(predictor.model, "to"):
+                try:
+                    predictor.model.to("cuda")  # type: ignore[attr-defined]
+                except Exception:
+                    pass
+        except Exception:
+            pass
+
+    def _release_sam2_gpu(self, predictor):
+        """Best-effort release of SAM2 GPU residency between phases."""
+        try:
+            if predictor is None:
+                return
+            # Clear any sticky per-image state if exposed
+            for name in ("reset_image", "release_image", "clear_image", "clear_state"):
+                if hasattr(predictor, name) and callable(getattr(predictor, name)):
+                    try:
+                        getattr(predictor, name)()
+                    except Exception:
+                        pass
+            # Try moving large parts off-GPU (best-effort, may be a no-op)
+            for name in ("to", "cpu"):
+                if hasattr(predictor, name):
+                    try:
+                        if name == "to":
+                            predictor.to("cpu")  # type: ignore[attr-defined]
+                        else:
+                            predictor.cpu()  # type: ignore[attr-defined]
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception:
+            pass
+
+    # ---------- Full video ----------
+    def process_video(
+        self,
+        input_path: str,
+        output_path: str,
+        bg_config: Optional[Dict[str, Any]] = None,
+        progress_callback: Optional[Callable[[int, int, float], None]] = None,
+        stop_event: Optional[threading.Event] = None
+    ) -> Dict[str, Any]:
+        """
+        Process a full video with live progress and optional cancellation.
+        progress_callback(current_frame, total_frames, fps_live)
+        """
+        ok, msg = validate_video_file(input_path)
+        if not ok:
+            raise ValueError(f"Invalid or unreadable video: {msg}")
+
+        cap = cv2.VideoCapture(input_path)
+        if not cap.isOpened():
+            raise RuntimeError(f"Could not open video: {input_path}")
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+        fps_out = self.config.write_fps or (fps if fps and fps > 0 else 25.0)
+
+        # Background once (RGB)
+        background_rgb = self._prepare_background_from_config(bg_config, width, height)
+
+        # reset temporal state for a new video
+        self._prev_mask = None
+
+        # Writer selection
+        ffmpeg_pipe: Optional[_FFmpegPipe] = None
+        writer: Optional[cv2.VideoWriter] = None
+        ffmpeg_failed_reason = None
+
+        if self.config.use_nvenc and self._ffmpeg:
+            try:
+                ffmpeg_pipe = _FFmpegPipe(width, height, float(fps_out), output_path, self.config, log=self.log)
+            except Exception as e:
+                ffmpeg_failed_reason = str(e)
+                self.log.warning("FFmpeg NVENC pipeline unavailable. Falling back to OpenCV. Reason: %s", e)
+
+        if ffmpeg_pipe is None:
+            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+            writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+            if not writer.isOpened():
+                cap.release()
+                raise RuntimeError(f"Could not open VideoWriter for: {output_path}")
+
+        # Determine models and decide the execution mode
+        predictor = None
+        matanyone = None
+        try:
+            if self.models and hasattr(self.models, "get_sam2"):
+                predictor = self.models.get_sam2()
+        except Exception as e:
+            self.log.warning(f"SAM2 predictor unavailable: {e}")
+
+        try:
+            if self.models and hasattr(self.models, "get_matanyone"):
+                matanyone = self.models.get_matanyone()
+        except Exception as e:
+            self.log.warning(f"MatAnyOne unavailable: {e}")
+
+        use_windowed = bool(self.config.use_windowed and predictor is not None and matanyone is not None)
+
+        frame_count = 0
+        start_time = time.time()
+
+        try:
+            if not use_windowed:
+                # --------- Legacy per-frame path (fallback) ----------
+                while True:
+                    ret, frame_bgr = cap.read()
+                    if not ret:
+                        break
+                    if stop_event is not None and stop_event.is_set():
+                        self.log.info("Processing stopped by user request.")
+                        break
+
+                    result = self.process_frame(frame_bgr, background_rgb)
+                    out_bgr = np.ascontiguousarray(result["frame"])
+
+                    if ffmpeg_pipe is not None:
+                        try:
+                            ffmpeg_pipe.write(out_bgr)
+                        except Exception as e:
+                            self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
+                            try:
+                                ffmpeg_pipe.close()
+                            except Exception:
+                                pass
+                            ffmpeg_pipe = None
+                            if writer is None:
+                                fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                if not writer.isOpened():
+                                    raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                            writer.write(out_bgr)
+                    else:
+                        writer.write(out_bgr)
+
+                    frame_count += 1
+                    if progress_callback:
+                        elapsed = time.time() - start_time
+                        fps_live = frame_count / elapsed if elapsed > 0 else 0.0
+                        try:
+                            progress_callback(frame_count, total_frames, fps_live)
+                        except Exception:
+                            pass
+
+            else:
+                # --------- Windowed two-phase path ----------
+                WINDOW = max(1, int(self.config.window_size))
+
+                while True:
+                    # Read a window of frames
+                    frames_bgr: List[np.ndarray] = []
+                    for _ in range(WINDOW):
+                        ret, fr = cap.read()
+                        if not ret:
+                            break
+                        frames_bgr.append(fr)
+
+                    if not frames_bgr:
+                        break  # no more frames
+
+                    if stop_event is not None and stop_event.is_set():
+                        self.log.info("Processing stopped by user request.")
+                        break
+
+                    # Model-only downscale for model work (consistent per window)
+                    frames_small_bgr: List[np.ndarray] = []
+                    scales: List[float] = []
+                    for fr in frames_bgr:
+                        fr_small, s = self._model_downscale(fr)
+                        frames_small_bgr.append(fr_small)
+                        scales.append(s)
+                    # Use the first scale (frames are normally the same size)
+                    scale = scales[0] if scales else 1.0
+
+                    # Convert small frames to RGB for the models
+                    frames_small_rgb = [cv2.cvtColor(fb, cv2.COLOR_BGR2RGB) for fb in frames_small_bgr]
+
+                    # -------- SAM2 phase (prime with the first frame's mask) --------
+                    self._prepare_sam2_gpu(predictor)
+                    try:
+                        mask_small = segment_person_hq(frames_small_rgb[0], predictor, use_sam2=True)
+                    except Exception as e:
+                        self.log.warning(f"SAM2 segmentation error on window start: {e}")
+                        mask_small = segment_person_hq(frames_small_rgb[0], None, use_sam2=False)
+
+                    # Release SAM2 GPU residency before the MatAnyOne phase
+                    self._release_sam2_gpu(predictor)
+
+                    # -------- MatAnyOne phase (prime + propagate) --------
+                    if hasattr(matanyone, "reset"):
+                        try:
+                            matanyone.reset()
+                        except Exception:
+                            pass
+
+                    for j, fr_rgb_small in enumerate(frames_small_rgb):
+                        try:
+                            if j == 0:
+                                m2d = mask_small
+                                if m2d.ndim == 3:
+                                    m2d = m2d[..., 0]
+                                alpha_small = matanyone(fr_rgb_small, m2d)  # adapter returns float32 [h, w]
+                            else:
+                                alpha_small = matanyone(fr_rgb_small)  # propagate (no mask)
+
+                            # Stabilize + harden at model scale
+                            alpha_small = np.clip(alpha_small.astype(np.float32), 0.0, 1.0)
+                            alpha_stable = self._stabilize(alpha_small)
+                            alpha_harden = self._harden(alpha_stable)
+
+                            # Upsample back to full resolution
+                            if scale != 1.0:
+                                H, W = frames_bgr[j].shape[:2]
+                                alpha_full = cv2.resize(alpha_harden, (W, H), interpolation=cv2.INTER_LINEAR)
+                            else:
+                                alpha_full = alpha_harden
+
+                            # Composite at full resolution (expects RGB)
+                            frame_rgb_full = cv2.cvtColor(frames_bgr[j], cv2.COLOR_BGR2RGB)
+                            out_rgb = replace_background_hq(frame_rgb_full, alpha_full, background_rgb)
+                            out_bgr = cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
+                            out_bgr = np.ascontiguousarray(out_bgr)
+
+                            # Write
+                            if ffmpeg_pipe is not None:
+                                try:
+                                    ffmpeg_pipe.write(out_bgr)
+                                except Exception as e:
+                                    self.log.warning("Switching to OpenCV writer after FFmpeg error at frame %d: %s", frame_count, e)
+                                    try:
+                                        ffmpeg_pipe.close()
+                                    except Exception:
+                                        pass
+                                    ffmpeg_pipe = None
+                                    if writer is None:
+                                        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                        writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                        if not writer.isOpened():
+                                            raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                                    writer.write(out_bgr)
+                            else:
+                                writer.write(out_bgr)
+
+                            frame_count += 1
+
+                        except Exception as e:
+                            # If MatAnyOne fails, log and fall back to the SAM-only mask for this frame
+                            self.log.warning(f"MatAnyOne failed at window frame {j}: {e}")
+                            if j == 0:
+                                alpha_small_fb = np.clip(mask_small.astype(np.float32), 0.0, 1.0)
+                            else:
+                                # _prev_mask may be None if every frame so far failed; use an empty mask then
+                                alpha_small_fb = self._prev_mask if self._prev_mask is not None else np.zeros(fr_rgb_small.shape[:2], dtype=np.float32)
+
+                            if scale != 1.0:
+                                H, W = frames_bgr[j].shape[:2]
+                                alpha_full_fb = cv2.resize(alpha_small_fb, (W, H), interpolation=cv2.INTER_LINEAR)
+                            else:
+                                alpha_full_fb = alpha_small_fb
+
+                            frame_rgb_full = cv2.cvtColor(frames_bgr[j], cv2.COLOR_BGR2RGB)
+                            out_rgb_fb = replace_background_hq(frame_rgb_full, alpha_full_fb, background_rgb)
+                            out_bgr_fb = cv2.cvtColor(out_rgb_fb, cv2.COLOR_RGB2BGR)
+
+                            if ffmpeg_pipe is not None:
+                                try:
+                                    ffmpeg_pipe.write(np.ascontiguousarray(out_bgr_fb))
+                                except Exception:
+                                    try:
+                                        ffmpeg_pipe.close()
+                                    except Exception:
+                                        pass
+                                    ffmpeg_pipe = None
+                                    if writer is None:
+                                        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+                                        writer = cv2.VideoWriter(output_path, fourcc, float(fps_out), (width, height))
+                                        if not writer.isOpened():
+                                            raise RuntimeError(f"FFmpeg failed and VideoWriter could not open: {output_path}")
+                                    writer.write(np.ascontiguousarray(out_bgr_fb))
+                            else:
+                                writer.write(np.ascontiguousarray(out_bgr_fb))
+                            frame_count += 1
+
+                        # Progress update
+                        if progress_callback:
+                            elapsed = time.time() - start_time
+                            fps_live = frame_count / elapsed if elapsed > 0 else 0.0
+                            try:
+                                progress_callback(frame_count, total_frames, fps_live)
+                            except Exception:
+                                pass
+
+                    # Clean per-window buffers (CPU) and let CUDA defrag
+                    del frames_bgr, frames_small_bgr, frames_small_rgb, mask_small
+                    try:
+                        import torch
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
+                    except Exception:
+                        pass
+
+        finally:
+            cap.release()
+            if writer is not None:
+                writer.release()
+            if ffmpeg_pipe is not None:
+                try:
+                    ffmpeg_pipe.close()
+                except Exception:
+                    pass
+
+        if ffmpeg_failed_reason:
+            self.log.info("Completed via OpenCV writer (FFmpeg initially failed): %s", ffmpeg_failed_reason)
+
+        self.log.info("Processed %d frames → %s", frame_count, output_path)
+        return {
+            "frames": frame_count,
+            "width": width,
+            "height": height,
+            "fps_out": float(fps_out),
+            "output_path": output_path,
+        }
+
+
+# Backward-compat alias used elsewhere
+VideoProcessor = CoreVideoProcessor
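Putting it together, a sketch of a cancellable run with live progress (file names hypothetical; the models provider, if any, is whatever object exposes get_sam2()/get_matanyone()):

```python
import threading

stop = threading.Event()

def on_progress(done: int, total: int, fps_live: float) -> None:
    print(f"{done}/{total} frames ({fps_live:.1f} fps)")

proc = CoreVideoProcessor(config=ProcessorConfig(background_preset="office"))
stats = proc.process_video(
    "input.mp4", "output.mp4",
    bg_config={"gradient": {"start": "#222222", "end": "#888888", "angle_deg": 0}},
    progress_callback=on_progress,
    stop_event=stop,  # call stop.set() from another thread to cancel
)
print(stats)  # {"frames": ..., "width": ..., "height": ..., "fps_out": ..., "output_path": ...}
```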