netflix
/

void-model

+"""
+VOID (Video Object and Interaction Deletion) Pipeline.
+Simple usage:
+    from pipeline_void import VOIDPipeline
+    pipe = VOIDPipeline.from_pretrained("netflix/void-model")
+    result = pipe.inpaint("input.mp4", "quadmask.mp4", "A lime falls on the table.")
+    result.save("output.mp4")
+Pass 2 refinement:
+    pipe2 = VOIDPipeline.from_pretrained("netflix/void-model", void_pass=2)
+    result2 = pipe2.inpaint("input.mp4", "quadmask.mp4", "A lime falls on the table.",
+                            pass1_video="output.mp4")
+    result2.save("output_refined.mp4")
+"""
+import os
+import json
+import subprocess
+import sys
+import tempfile
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors.torch import load_file
+from diffusers import CogVideoXDDIMScheduler
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from cogvideox_transformer3d import CogVideoXTransformer3DModel
+from cogvideox_vae import AutoencoderKLCogVideoX
+from pipeline_cogvideox_fun_inpaint import CogVideoXFunInpaintPipeline
# Hugging Face repo ID of the base model that VOID is fine-tuned from.
BASE_MODEL_REPO = "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP"

# Checkpoint filename in the VOID repo for each supported pass number.
PASS_CHECKPOINTS = {
    pass_id: f"void_pass{pass_id}.safetensors" for pass_id in (1, 2)
}

# Default negative prompt (from config/quadmask_cogvideox.py).
# Each sentence group keeps its trailing space, including the last one.
DEFAULT_NEGATIVE_PROMPT = "".join(
    (
        "The video is not of a high quality, it has a low resolution. ",
        "Watermark present in each frame. The background is solid. ",
        "Strange body and strange trajectory. Distortion. ",
    )
)
@dataclass
class VOIDOutput:
    """Container for the result of a VOID pipeline run.

    Attributes:
        video: Output frames as a (T, H, W, 3) uint8 tensor.
        video_float: Output clip as a (1, C, T, H, W) float tensor in [0, 1].
    """

    video: torch.Tensor  # (T, H, W, 3) uint8
    video_float: torch.Tensor  # (1, C, T, H, W) float [0, 1]

    def save(self, path: str, fps: int = 12):
        """Write ``self.video`` to ``path`` via imageio.

        Args:
            path: Destination file path handed to ``imageio.mimwrite``.
            fps: Frame rate handed to ``imageio.mimwrite`` (default 12).
        """
        # Imported lazily so that imageio is only required when saving.
        import imageio

        frame_stack = self.video.cpu().numpy()
        frames = list(frame_stack)  # one array per frame
        imageio.mimwrite(path, frames, fps=fps)
        print(f"Saved {len(frames)} frames to {path}")
def _merge_void_weights(transformer, checkpoint_path):
    """Merge a VOID checkpoint into the base transformer.

    The checkpoint is loaded non-strictly. The only special case is
    ``patch_embed.proj.weight``: when its input width (dim 1) differs between
    checkpoint and transformer, a copy of the transformer's weight is used
    as the target and only the first and last ``16 * 8 = 128`` input columns
    are taken from the checkpoint. The columns in between keep the
    transformer's existing values.

    If either side does not contain ``patch_embed.proj.weight`` the special
    case is skipped and the key is handled (and reported) by the non-strict
    load, rather than raising ``KeyError``.

    Args:
        transformer: Module exposing ``state_dict()`` / ``load_state_dict()``.
        checkpoint_path: Path to a ``.safetensors`` checkpoint.

    Returns:
        The same ``transformer`` object, with the checkpoint weights loaded.
    """
    state_dict = load_file(checkpoint_path)
    param_name = "patch_embed.proj.weight"

    # Build the module's state dict once instead of once per access.
    base_weight = transformer.state_dict().get(param_name)
    ckpt_weight = state_dict.get(param_name)

    if (
        base_weight is not None
        and ckpt_weight is not None
        and ckpt_weight.size(1) != base_weight.size(1)
    ):
        latent_ch = 16
        feat_scale = 8
        feat_dim = int(latent_ch * feat_scale)
        # Start from the transformer's weight so its shape is preserved,
        # then overwrite the leading and trailing column groups.
        new_weight = base_weight.clone()
        new_weight[:, :feat_dim] = ckpt_weight[:, :feat_dim]
        new_weight[:, -feat_dim:] = ckpt_weight[:, -feat_dim:]
        state_dict[param_name] = new_weight

    missing, unexpected = transformer.load_state_dict(state_dict, strict=False)
    if missing:
        print(f"[VOID] Missing keys: {len(missing)}")
    if unexpected:
        print(f"[VOID] Unexpected keys: {len(unexpected)}")
    return transformer
def _load_video(path: str, max_frames: int) -> np.ndarray:
    """Load at most ``max_frames`` leading frames of a video.

    Frames are pulled lazily from ``imageio.imiter`` and decoding stops as
    soon as ``max_frames`` frames have been read, so long videos are not
    fully decoded just to be truncated.

    Args:
        path: Path of the video file to read.
        max_frames: Maximum number of frames to return.

    Returns:
        Array of the stacked frames, (T, H, W, 3) uint8 for RGB video.

    Raises:
        ValueError: If no frame could be read (empty video, or
            ``max_frames`` <= 0).
    """
    from contextlib import closing
    from itertools import islice

    import imageio

    # closing() finalizes the frame generator right away, which releases the
    # underlying reader even though iteration stops early.
    with closing(imageio.imiter(path)) as frame_iter:
        frames = list(islice(frame_iter, max(max_frames, 0)))
    if not frames:
        raise ValueError(f"No frames could be read from video: {path}")
    return np.array(frames)
def _prep_video_tensor(
    video_np: np.ndarray,
    sample_size: Tuple[int, int],
) -> torch.Tensor:
    """Turn a (T, H, W, C) video array into the pipeline's input tensor.

    Args:
        video_np: Video frames as a (T, H, W, C) array with values in 0..255.
        sample_size: Target (height, width) after resizing.

    Returns:
        (1, C, T, H, W) float32 tensor in [0, 1].
    """
    frames = torch.from_numpy(video_np).to(torch.float32)
    # (T, H, W, C) -> (C, T, H, W), rescaled from 0..255 to 0..1.
    channels_first = frames.permute(3, 0, 1, 2).div(255.0)
    # With a 4-D input, interpolate only resizes the trailing (H, W) dims.
    resized = F.interpolate(channels_first, size=sample_size, mode="area")
    return resized[None]  # prepend batch dim -> (1, C, T, H, W)
def _prep_mask_tensor(
    mask_np: np.ndarray,
    sample_size: Tuple[int, int],
    use_quadmask: bool = True,
) -> torch.Tensor:
    """Turn a mask video array into the pipeline's mask tensor.

    The mask is resized, snapped to a small set of gray levels, inverted and
    scaled to [0, 1].

    Quadmask levels (``use_quadmask=True``):
        value <= 31 -> 0, 31 < value <= 95 -> 63,
        95 < value <= 191 -> 127, value > 191 -> 255.
    Trimask levels (``use_quadmask=False``):
        value < 64 -> 0, 64 <= value <= 192 -> 128, value > 192 -> 255.

    Args:
        mask_np: (T, H, W) or (T, H, W, C) array; for 4-D input only
            channel 0 is used.
        sample_size: Target (height, width) after resizing.
        use_quadmask: Select the 4-level (True) or 3-level (False) scheme.

    Returns:
        (1, 1, T, H, W) float32 tensor in [0, 1].
    """
    raw = torch.from_numpy(mask_np).float()
    if raw.ndim == 4:
        raw = raw[..., 0]  # keep first channel -> (T, H, W)

    # (T, H, W) -> (1, T, H, W) for interpolate, then -> (1, 1, T, H, W).
    resized = F.interpolate(raw[None], size=sample_size, mode="area")[None]

    # Start every pixel at level 0 and raise it for each threshold it passes.
    # Thresholds are applied in ascending order so the highest one wins.
    levels = torch.zeros_like(resized)
    if use_quadmask:
        for threshold, level in ((31.0, 63.0), (95.0, 127.0), (191.0, 255.0)):
            levels[resized > threshold] = level
    else:
        levels[resized >= 64.0] = 128.0
        levels[resized > 192.0] = 255.0

    # Invert and normalize to [0, 1].
    return (255.0 - levels) / 255.0
def _temporal_padding(
    tensor: torch.Tensor,
    min_length: int = 85,
    max_length: int = 197,
    dim: int = 2,
) -> torch.Tensor:
    """Truncate or mirror-pad ``tensor`` along ``dim`` to a target length.

    The target length is the smallest ``4k + 1`` that covers the input
    length, bumped by 4 when ``k`` is even, then clamped to
    ``[min_length, max_length]`` (``min_length`` wins if the two conflict).
    NOTE(review): the even-``k`` bump presumably keeps the latent frame
    count (``k + 1``) even for the model's temporal patching - confirm
    against the CogVideoX-Fun pipeline.

    The original parity test was ``(min_len / 4) % 2 == 0``, which can never
    be true for a ``4k + 1`` value, so the bump was dead code. It now uses
    integer division.

    Padding appends a reversed copy of the sequence repeatedly, so the
    boundary frame is repeated at each reflection, then trims to the target.

    Args:
        tensor: Input tensor; the padded axis is ``dim``.
        min_length: Lower bound for the output length.
        max_length: Upper bound for the output length (before ``min_length``
            is applied).
        dim: Axis to pad. Any valid axis is supported.

    Returns:
        Tensor whose size along ``dim`` equals the target length.

    Raises:
        ValueError: If ``tensor`` has size 0 along ``dim``, since mirroring
            nothing would never reach the target length.
    """
    length = tensor.size(dim)
    if length == 0:
        raise ValueError("Cannot temporally pad a tensor with no frames")

    # Smallest 4k + 1 that is >= length.
    target_length = (length // 4) * 4 + 1
    if target_length < length:
        target_length += 4
    # Require an odd k; integer division is what makes this reachable.
    if (target_length // 4) % 2 == 0:
        target_length += 4
    target_length = max(min_length, min(target_length, max_length))

    # Truncate if the input is already longer than the target.
    tensor = tensor.narrow(dim, 0, min(length, target_length))

    # Pad by mirroring until long enough, then trim the overshoot.
    while tensor.size(dim) < target_length:
        tensor = torch.cat([tensor, torch.flip(tensor, [dim])], dim=dim)
    return tensor.narrow(dim, 0, target_length)
def _generate_warped_noise(
    pass1_video_path: str,
    target_shape: Tuple[int, int, int, int],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Generate warped noise from a Pass 1 output video.

    Two strategies are tried in order:

    1. In-process generation through the ``rp`` package. Any failure is
       printed and the function falls through to step 2.
    2. Running ``make_warped_noise.py`` (looked up next to this file, then
       under ``../inference/cogvideox_fun``) as a subprocess into a
       temporary directory and loading the ``noises.npy`` it writes.

    Side effect: when ``CUDA_VISIBLE_DEVICES`` is set to anything other than
    comma-separated digits, it is overwritten with ``"0"`` in
    ``os.environ`` and stays that way after the call.

    Args:
        pass1_video_path: Path to Pass 1 output video.
        target_shape: (latent_T, latent_H, latent_W, latent_C)
        device: Target device.
        dtype: Target dtype.

    Returns:
        (1, T, C, H, W) warped noise tensor.

    Raises:
        RuntimeError: If no generation script is found, the script exits
            with a non-zero status, or no ``noises.npy`` is produced.
        subprocess.TimeoutExpired: If the script runs longer than 600 s.
    """
    # --- Strategy 1: direct generation via rp / noise_warp ---
    try:
        # Fix for SLURM: rp crashes parsing GPU UUIDs like "GPU-9fca2b4f-...",
        # so force a numeric index whenever the variable is not purely numeric.
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
        is_numeric = visible_devices.replace(",", "").isdigit()
        if visible_devices and not is_numeric:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"

        import rp
        rp.r._pip_import_autoyes = True
        rp.git_import('CommonSource')
        # Imported here only to verify that noise_warp is available.
        import rp.git.CommonSource.noise_warp as nw  # noqa: F401

        return _generate_warped_noise_direct(pass1_video_path, target_shape, device, dtype)
    except ImportError as e:
        print(f"[VOID] rp/noise_warp not available: {e}")
    except Exception as e:
        # Best effort: report the failure and fall back to the script.
        print(f"[VOID] Warped noise generation via rp failed: {e}")
        import traceback
        traceback.print_exc()

    # --- Strategy 2: run make_warped_noise.py as a subprocess ---
    module_dir = os.path.dirname(__file__)
    script_candidates = (
        os.path.join(module_dir, "make_warped_noise.py"),
        os.path.join(module_dir, "..", "inference", "cogvideox_fun", "make_warped_noise.py"),
    )
    gwf_script = next(
        (candidate for candidate in script_candidates if os.path.exists(candidate)),
        None,
    )
    if gwf_script is None:
        raise RuntimeError(
            "Cannot generate warped noise: 'rp' package not installed and "
            "make_warped_noise.py not found. Install 'rp' package or provide "
            "pre-computed warped noise via warped_noise_path parameter."
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [sys.executable, gwf_script, os.path.abspath(pass1_video_path), tmpdir]
        print("[VOID] Generating warped noise (this may take a few minutes)...")
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
        if result.returncode != 0:
            raise RuntimeError(f"Warped noise generation failed:\n{result.stderr}")

        # The script may write into a per-video subdirectory or flat into tmpdir.
        video_stem = os.path.splitext(os.path.basename(pass1_video_path))[0]
        output_candidates = (
            os.path.join(tmpdir, video_stem, "noises.npy"),
            os.path.join(tmpdir, "noises.npy"),
        )
        noise_path = next(
            (candidate for candidate in output_candidates if os.path.exists(candidate)),
            None,
        )
        if noise_path is None:
            raise RuntimeError("Warped noise file not found after generation")
        # Load while the temporary directory still exists.
        return _load_warped_noise(noise_path, target_shape, device, dtype)
def _generate_warped_noise_direct(
    video_path: str,
    target_shape: Tuple[int, int, int, int],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Generate warped noise in-process using the ``rp`` package.

    The video is loaded, resampled to 72 frames, resized and center-cropped
    to 480x720, then passed to ``noise_warp.get_noise_from_video`` with 16
    noise channels. The resulting noise is converted to ``target_shape`` by
    ``_numpy_noise_to_tensor``.

    Args:
        video_path: Path to the video to derive noise from.
        target_shape: (latent_T, latent_H, latent_W, latent_C)
        device: Target device.
        dtype: Target dtype.

    Returns:
        (1, T, C, H, W) warped noise tensor.
    """
    import rp
    import rp.git.CommonSource.noise_warp as nw

    # Fixed preprocessing: 72 frames at 480x720, center-cropped.
    frames = rp.load_video(video_path)
    frames = rp.resize_list(frames, length=72)
    frames = rp.resize_images_to_hold(frames, height=480, width=720)
    frames = rp.crop_images(frames, height=480, width=720, origin='center')
    frames = rp.as_numpy_array(frames)

    frame_scale = 2**-1
    flow_scale = 2**3
    latent_scale = 8

    noise_settings = dict(
        remove_background=False,
        visualize=False,
        save_files=False,
        noise_channels=16,
        resize_frames=frame_scale,
        resize_flow=flow_scale,
        # round(0.5 * 8) * 8 == 32
        downscale_factor=round(frame_scale * flow_scale) * latent_scale,
    )
    warp_output = nw.get_noise_from_video(frames, **noise_settings)

    warped = warp_output.numpy_noises  # (T, H, W, C)
    return _numpy_noise_to_tensor(warped, target_shape, device, dtype)
def _load_warped_noise(
    noise_path: str,
    target_shape: Tuple[int, int, int, int],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Load pre-computed warped noise from a ``.npy`` file and fit it to
    ``target_shape``.

    The file may hold the noise as (T, H, W, C) or (T, C, H, W). The layout
    is detected from the channel count in ``target_shape``: the array is
    treated as (T, C, H, W) only when axis 1 has ``latent_C`` entries and
    the last axis does not. An array whose last axis already has
    ``latent_C`` entries is left as (T, H, W, C), which avoids transposing a
    THWC array whose height happens to equal the channel count.

    Args:
        noise_path: Path to the ``.npy`` file.
        target_shape: (latent_T, latent_H, latent_W, latent_C)
        device: Target device.
        dtype: Target dtype.

    Returns:
        (1, T, C, H, W) warped noise tensor.

    Raises:
        ValueError: If the stored array is not 4-dimensional.
    """
    noises = np.load(noise_path)
    if noises.ndim != 4:
        raise ValueError(
            f"Expected 4-D warped noise in {noise_path}, got shape {noises.shape}"
        )
    if noises.dtype == np.float16:
        noises = noises.astype(np.float32)

    latent_C = target_shape[3]
    # Ensure THWC format.
    if noises.shape[-1] != latent_C and noises.shape[1] == latent_C:
        noises = np.transpose(noises, (0, 2, 3, 1))  # TCHW -> THWC
    return _numpy_noise_to_tensor(noises, target_shape, device, dtype)
def _numpy_noise_to_tensor(
    noises: np.ndarray,
    target_shape: Tuple[int, int, int, int],
    device: torch.device,
    dtype: torch.dtype,
) -> torch.Tensor:
    """Convert numpy noise (T, H, W, C) to a pipeline tensor (1, T, C, H, W).

    The noise is resampled to ``target_shape`` where needed: linearly
    between neighbouring frames along time, and with per-channel bilinear
    ``cv2.resize`` spatially.

    Args:
        noises: Noise array laid out as (T, H, W, C).
        target_shape: (latent_T, latent_H, latent_W, latent_C)
        device: Target device.
        dtype: Target dtype.

    Returns:
        Tensor of shape (1, latent_T, latent_C, latent_H, latent_W).

    Raises:
        ValueError: If ``noises`` is not 4-D, has no frames, or its channel
            count differs from ``latent_C``.
    """
    latent_T, latent_H, latent_W, latent_C = target_shape

    # Fail early: a wrong channel count would otherwise raise an IndexError
    # or silently drop channels in the spatial resize below.
    if noises.ndim != 4 or noises.shape[0] == 0 or noises.shape[-1] != latent_C:
        raise ValueError(
            f"Expected non-empty noise of shape (T, H, W, {latent_C}), "
            f"got {tuple(noises.shape)}"
        )

    # cv2.resize does not handle half precision. Upcast here so that callers
    # which do not go through the file loader are covered as well.
    if noises.dtype == np.float16:
        noises = noises.astype(np.float32)

    # Temporal resize if needed: blend the two nearest source frames.
    source_T = noises.shape[0]
    if source_T != latent_T:
        indices = np.linspace(0, source_T - 1, latent_T)
        lower = np.floor(indices).astype(int)
        upper = np.ceil(indices).astype(int)
        frac = (indices - lower)[:, None, None, None]
        noises = noises[lower] * (1 - frac) + noises[upper] * frac

    # Spatial resize if needed, one (frame, channel) plane at a time.
    if noises.shape[1] != latent_H or noises.shape[2] != latent_W:
        resized = np.zeros((latent_T, latent_H, latent_W, latent_C), dtype=noises.dtype)
        for t in range(latent_T):
            for c in range(latent_C):
                resized[t, :, :, c] = cv2.resize(
                    noises[t, :, :, c], (latent_W, latent_H),  # cv2 takes (width, height)
                    interpolation=cv2.INTER_LINEAR,
                )
        noises = resized

    # Convert to tensor: (T, H, W, C) -> (1, T, C, H, W)
    tensor = torch.from_numpy(noises).permute(0, 3, 1, 2).unsqueeze(0)
    return tensor.to(device=device, dtype=dtype)
class VOIDPipeline(CogVideoXFunInpaintPipeline):
    """
    VOID: Video Object and Interaction Deletion.

    Removes objects and their physical interactions from videos using
    quadmask-conditioned video inpainting. Build an instance with
    ``from_pretrained`` and run it with ``inpaint``.
    """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        void_pass: int = 1,
        base_model: str = BASE_MODEL_REPO,
        torch_dtype: torch.dtype = torch.bfloat16,
        **kwargs,
    ):
        """
        Load the VOID pipeline.

        The base model's transformer, VAE, tokenizer, text encoder and
        scheduler are loaded from ``base_model``; the VOID checkpoint for
        ``void_pass`` is then merged into the transformer.

        Args:
            pretrained_model_name_or_path: HF repo ID or local directory
                containing VOID checkpoint files (void_pass1.safetensors, etc.)
            void_pass: Which pass checkpoint to load (1 or 2). Default: 1.
            base_model: HF repo ID or local directory of the base
                CogVideoX-Fun model.
            torch_dtype: Weight dtype. Default: torch.bfloat16.
            **kwargs: Accepted for call compatibility; currently ignored.

        Returns:
            A ready-to-use ``VOIDPipeline``.

        Raises:
            ValueError: If ``void_pass`` is not a known pass number.
            FileNotFoundError: If a local directory is given but does not
                contain the checkpoint for ``void_pass``.
        """
        if void_pass not in PASS_CHECKPOINTS:
            raise ValueError(f"void_pass must be 1 or 2, got {void_pass}")

        # --- Locate or download the VOID checkpoint ---
        checkpoint_name = PASS_CHECKPOINTS[void_pass]
        print(f"[VOID] Loading Pass {void_pass} checkpoint...")
        if os.path.isdir(pretrained_model_name_or_path):
            checkpoint_path = os.path.join(pretrained_model_name_or_path, checkpoint_name)
            # Fail here with a clear message, before the base model is fetched.
            if not os.path.isfile(checkpoint_path):
                raise FileNotFoundError(f"VOID checkpoint not found: {checkpoint_path}")
        else:
            checkpoint_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=checkpoint_name,
            )

        # --- Locate or download the base model ---
        print(f"[VOID] Loading base model: {base_model}")
        # A local directory is used as-is, mirroring the checkpoint handling.
        if os.path.isdir(base_model):
            base_model_path = base_model
        else:
            base_model_path = snapshot_download(repo_id=base_model)

        # Transformer (with VAE mask channels)
        print("[VOID] Loading transformer...")
        transformer = CogVideoXTransformer3DModel.from_pretrained(
            base_model_path,
            subfolder="transformer",
            low_cpu_mem_usage=True,
            torch_dtype=torch_dtype,
            use_vae_mask=True,
        )

        # Merge VOID weights, then recast because merged tensors keep the
        # checkpoint's dtype.
        print(f"[VOID] Merging Pass {void_pass} weights...")
        transformer = _merge_void_weights(transformer, checkpoint_path)
        transformer = transformer.to(torch_dtype)

        # VAE
        print("[VOID] Loading VAE...")
        vae = AutoencoderKLCogVideoX.from_pretrained(
            base_model_path, subfolder="vae"
        ).to(torch_dtype)

        # Tokenizer + Text encoder
        print("[VOID] Loading tokenizer and text encoder...")
        from transformers import T5Tokenizer, T5EncoderModel
        tokenizer = T5Tokenizer.from_pretrained(base_model_path, subfolder="tokenizer")
        text_encoder = T5EncoderModel.from_pretrained(
            base_model_path, subfolder="text_encoder", torch_dtype=torch_dtype,
        )

        # Scheduler
        scheduler = CogVideoXDDIMScheduler.from_pretrained(
            base_model_path, subfolder="scheduler"
        )

        # Build pipeline
        pipe = cls(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            transformer=transformer,
            scheduler=scheduler,
        )
        pipe._void_pass = void_pass  # remembered on the instance
        print("[VOID] Pipeline ready!")
        return pipe

    def inpaint(
        self,
        video_path: str,
        mask_path: str,
        prompt: str,
        negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
        height: int = 384,
        width: int = 672,
        num_inference_steps: int = 30,
        guidance_scale: float = 1.0,
        strength: float = 1.0,
        temporal_window_size: int = 85,
        max_video_length: int = 197,
        fps: int = 12,
        seed: int = 42,
        pass1_video: Optional[str] = None,
        warped_noise_path: Optional[str] = None,
        use_quadmask: bool = True,
    ) -> VOIDOutput:
        """
        Run VOID inpainting on a video.

        Args:
            video_path: Path to input video (mp4).
            mask_path: Path to quadmask video (mp4). Grayscale with values:
                0=object to remove, 63=overlap, 127=affected region, 255=background.
            prompt: Text description of the desired result after removal.
                E.g., "A lime falls on the table."
            negative_prompt: Negative prompt for generation quality.
            height: Output height (default 384).
            width: Output width (default 672).
            num_inference_steps: Denoising steps (default 30).
            guidance_scale: CFG scale (default 1.0 = no CFG).
            strength: Denoising strength (default 1.0).
            temporal_window_size: Frames per inference window (default 85).
            max_video_length: Max frames to process (default 197).
            fps: Currently unused by this method; pass ``fps`` to
                ``VOIDOutput.save`` to control the saved frame rate.
            seed: Random seed (default 42).
            pass1_video: Path to Pass 1 output video, for Pass 2 warped noise init.
                Ignored when ``warped_noise_path`` is given.
            warped_noise_path: Path to pre-computed warped noise (.npy).
            use_quadmask: Use 4-value quadmask (default True). Set False for trimask.

        Returns:
            VOIDOutput with .video (uint8) and .save() method.

        Raises:
            ValueError: If the video or the mask contains no frames.
        """
        sample_size = (height, width)

        # Align video length to VAE temporal compression ratio
        vae_temporal_ratio = self.vae.config.temporal_compression_ratio
        video_length = int((max_video_length - 1) // vae_temporal_ratio * vae_temporal_ratio) + 1

        # --- Load and prep video ---
        print("[VOID] Loading video and mask...")
        vid_np = _load_video(video_path, video_length)
        mask_np = _load_video(mask_path, video_length)

        # Keep video and mask frame-aligned. Padding each one independently
        # would otherwise give different lengths or shifted mirror points.
        common_frames = min(len(vid_np), len(mask_np))
        if common_frames == 0:
            raise ValueError("Input video or mask contains no frames")
        if len(vid_np) != len(mask_np):
            print(
                f"[VOID] Warning: video has {len(vid_np)} frames but mask has "
                f"{len(mask_np)}; using the first {common_frames} of each."
            )
            vid_np = vid_np[:common_frames]
            mask_np = mask_np[:common_frames]

        video = _prep_video_tensor(vid_np, sample_size)
        mask = _prep_mask_tensor(mask_np, sample_size, use_quadmask=use_quadmask)

        # Temporal padding
        video = _temporal_padding(video, min_length=temporal_window_size, max_length=max_video_length)
        mask = _temporal_padding(mask, min_length=temporal_window_size, max_length=max_video_length)
        num_frames = min(video.shape[2], temporal_window_size)
        print(f"[VOID] Video: {video.shape}, Mask: {mask.shape}, Frames: {num_frames}")

        # --- Handle warped noise for Pass 2 ---
        latents = None
        if warped_noise_path is not None or pass1_video is not None:
            # Latent geometry: 4x temporal / 8x spatial compression, 16 channels.
            latent_T = (num_frames - 1) // 4 + 1
            latent_H = height // 8
            latent_W = width // 8
            latent_C = 16
            target_shape = (latent_T, latent_H, latent_W, latent_C)

            # Follow the loaded weight dtype instead of assuming bfloat16, so
            # a pipeline loaded with another torch_dtype gets matching noise.
            noise_dtype = getattr(self.transformer, "dtype", torch.bfloat16)
            noise_device = torch.device("cpu")

            if warped_noise_path is not None:
                print(f"[VOID] Loading pre-computed warped noise from {warped_noise_path}")
                latents = _load_warped_noise(
                    warped_noise_path, target_shape,
                    device=noise_device, dtype=noise_dtype,
                )
            else:
                print("[VOID] Generating warped noise from Pass 1 output...")
                latents = _generate_warped_noise(
                    pass1_video, target_shape,
                    device=noise_device, dtype=noise_dtype,
                )
            print(f"[VOID] Warped noise: {latents.shape}, mean={latents.mean():.4f}, std={latents.std():.4f}")

        # --- Run inference ---
        generator = torch.Generator(device="cpu").manual_seed(seed)
        print(f"[VOID] Running inference ({num_frames} frames, {num_inference_steps} steps)...")
        with torch.no_grad():
            # use_trimask / use_vae_mask are passed as True unconditionally,
            # independent of use_quadmask.
            output = self(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_frames=num_frames,
                height=height,
                width=width,
                guidance_scale=guidance_scale,
                num_inference_steps=num_inference_steps,
                generator=generator,
                video=video,
                mask_video=mask,
                strength=strength,
                use_trimask=True,
                use_vae_mask=True,
                latents=latents,
            ).videos

        # --- Process output ---
        if isinstance(output, np.ndarray):
            output = torch.from_numpy(output)
        # output is (B, C, T, H, W) in [0, 1]
        video_float = output
        # First batch item -> (T, H, W, C) uint8
        video_uint8 = (output[0].permute(1, 2, 3, 0).clamp(0, 1) * 255).to(torch.uint8)
        print(f"[VOID] Done! Output: {video_uint8.shape}")
        return VOIDOutput(video=video_uint8, video_float=video_float)