solo-depth-any-video

Sleeping

File size: 8,952 Bytes

a1739b9

import torch
import tqdm
import numpy as np
from diffusers import DiffusionPipeline
from diffusers.utils import BaseOutput
import matplotlib


def colorize_depth(depth, cmap="Spectral"):
    # colorize
    cm = matplotlib.colormaps[cmap]
    # (B, N, H, W, 3)
    depth_colored = cm(depth, bytes=False)[..., 0:3]  # value from 0 to 1
    return depth_colored


class DAVOutput(BaseOutput):
    r"""
    Output class for zero-shot text-to-video pipeline.

    Args:
        frames (`[List[PIL.Image.Image]`, `np.ndarray`]):
            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
            num_channels)`.
    """

    disparity: np.ndarray
    disparity_colored: np.ndarray
    image: np.ndarray


class DAVPipeline(DiffusionPipeline):
    def __init__(self, vae, unet, unet_interp, scheduler):
        super().__init__()
        self.register_modules(
            vae=vae, unet=unet, unet_interp=unet_interp, scheduler=scheduler
        )

    def encode(self, input):
        num_frames = input.shape[1]
        input = input.flatten(0, 1)
        latent = self.vae.encode(input.to(self.vae.dtype)).latent_dist.mode()
        latent = latent * self.vae.config.scaling_factor
        latent = latent.reshape(-1, num_frames, *latent.shape[1:])
        return latent

    def decode(self, latents, decode_chunk_size=16):
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        num_frames = latents.shape[1]
        latents = latents.flatten(0, 1)
        latents = latents / self.vae.config.scaling_factor

        # decode decode_chunk_size frames at a time to avoid OOM
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            frame = self.vae.decode(
                latents[i : i + decode_chunk_size].to(self.vae.dtype),
                num_frames=num_frames_in,
            ).sample
            frames.append(frame)
        frames = torch.cat(frames, dim=0)

        # [batch, frames, channels, height, width]
        frames = frames.reshape(-1, num_frames, *frames.shape[1:])
        return frames.to(torch.float32)

    def single_infer(self, rgb, position_ids=None, num_inference_steps=None):
        rgb_latent = self.encode(rgb)
        noise_latent = torch.randn_like(rgb_latent)

        self.scheduler.set_timesteps(num_inference_steps, device=rgb.device)
        timesteps = self.scheduler.timesteps

        image_embeddings = torch.zeros((noise_latent.shape[0], 1, 1024)).to(
            noise_latent
        )

        for i, t in enumerate(timesteps):
            latent_model_input = noise_latent

            latent_model_input = torch.cat([latent_model_input, rgb_latent], dim=2)

            # [batch_size, num_frame, 4, h, w]
            model_output = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=image_embeddings,
                position_ids=position_ids,
            ).sample

            # compute the previous noisy sample x_t -> x_t-1
            noise_latent = self.scheduler.step(
                model_output, t, noise_latent
            ).prev_sample

        return noise_latent

    def single_interp_infer(
        self, rgb, masked_depth_latent, mask, num_inference_steps=None
    ):
        rgb_latent = self.encode(rgb)
        noise_latent = torch.randn_like(rgb_latent)

        self.scheduler.set_timesteps(num_inference_steps, device=rgb.device)
        timesteps = self.scheduler.timesteps

        image_embeddings = torch.zeros((noise_latent.shape[0], 1, 1024)).to(
            noise_latent
        )

        for i, t in enumerate(timesteps):
            latent_model_input = noise_latent

            latent_model_input = torch.cat(
                [latent_model_input, rgb_latent, masked_depth_latent, mask], dim=2
            )

            # [batch_size, num_frame, 4, h, w]
            model_output = self.unet_interp(
                latent_model_input, t, encoder_hidden_states=image_embeddings
            ).sample

            # compute the previous noisy sample x_t -> x_t-1
            noise_latent = self.scheduler.step(
                model_output, t, noise_latent
            ).prev_sample

        return noise_latent

    def __call__(
        self,
        image,
        num_frames,
        num_overlap_frames,
        num_interp_frames,
        decode_chunk_size,
        num_inference_steps,
    ):
        self.vae.to(dtype=torch.float16)

        # (1, N, 3, H, W)
        image = image.unsqueeze(0)
        B, N = image.shape[:2]
        rgb = image * 2 - 1  # [-1, 1]

        if N <= num_frames or N <= num_interp_frames + 2 - num_overlap_frames:
            depth_latent = self.single_infer(
                rgb, num_inference_steps=num_inference_steps
            )
        else:
            assert 2 <= num_overlap_frames <= (num_interp_frames + 2 + 1) // 2
            assert num_frames % 2 == 0

            key_frame_indices = []
            for i in range(0, N, num_interp_frames + 2 - num_overlap_frames):
                if (
                    i + num_interp_frames + 1 >= N
                    or len(key_frame_indices) >= num_frames
                ):
                    break
                key_frame_indices.append(i)
                key_frame_indices.append(i + num_interp_frames + 1)

            key_frame_indices = torch.tensor(key_frame_indices, device=rgb.device)

            sorted_key_frame_indices, origin_indices = torch.sort(key_frame_indices)
            key_rgb = rgb[:, sorted_key_frame_indices]
            key_depth_latent = self.single_infer(
                key_rgb,
                sorted_key_frame_indices.unsqueeze(0).repeat(B, 1),
                num_inference_steps=num_inference_steps,
            )
            key_depth_latent = key_depth_latent[:, origin_indices]

            torch.cuda.empty_cache()

            depth_latent = []
            pre_latent = None
            for i in tqdm.tqdm(range(0, len(key_frame_indices), 2)):
                frame1 = key_depth_latent[:, i]
                frame2 = key_depth_latent[:, i + 1]
                masked_depth_latent = torch.zeros(
                    (B, num_interp_frames + 2, *key_depth_latent.shape[2:])
                ).to(key_depth_latent)
                masked_depth_latent[:, 0] = frame1
                masked_depth_latent[:, -1] = frame2

                mask = torch.zeros_like(masked_depth_latent)
                mask[:, [0, -1]] = 1.0

                latent = self.single_interp_infer(
                    rgb[:, key_frame_indices[i] : key_frame_indices[i + 1] + 1],
                    masked_depth_latent,
                    mask,
                    num_inference_steps=num_inference_steps,
                )
                latent = latent[:, 1:-1]

                if pre_latent is not None:
                    overlap_a = pre_latent[
                        :, pre_latent.shape[1] - (num_overlap_frames - 2) :
                    ]
                    overlap_b = latent[:, : (num_overlap_frames - 2)]
                    ratio = (
                        torch.linspace(0, 1, num_overlap_frames - 2)
                        .to(overlap_a)
                        .view(1, -1, 1, 1, 1)
                    )
                    overlap = overlap_a * (1 - ratio) + overlap_b * ratio
                    pre_latent[:, pre_latent.shape[1] - (num_overlap_frames - 2) :] = (
                        overlap
                    )
                    depth_latent.append(pre_latent)

                pre_latent = latent[:, (num_overlap_frames - 2) if i > 0 else 0 :]

                torch.cuda.empty_cache()

            depth_latent.append(pre_latent)
            depth_latent = torch.cat(depth_latent, dim=1)

            # dicard the first and last key frames
            image = image[:, key_frame_indices[0] + 1 : key_frame_indices[-1]]
            assert depth_latent.shape[1] == image.shape[1]

        disparity = self.decode(depth_latent, decode_chunk_size=decode_chunk_size)
        disparity = disparity.mean(dim=2, keepdim=False)
        disparity = torch.clamp(disparity * 0.5 + 0.5, 0.0, 1.0)

        # (N, H, W)
        disparity = disparity.squeeze(0)
        # (N, H, W, 3)
        mid_d, max_d = disparity.min(), disparity.max()
        disparity_colored = torch.clamp((max_d - disparity) / (max_d - mid_d), 0.0, 1.0)
        disparity_colored = colorize_depth(disparity_colored.cpu().numpy())
        disparity_colored = (disparity_colored * 255).astype(np.uint8)
        image = image.squeeze(0).permute(0, 2, 3, 1).cpu().numpy()
        image = (image * 255).astype(np.uint8)
        disparity = disparity.cpu().numpy()

        return DAVOutput(
            disparity=disparity,
            disparity_colored=disparity_colored,
            image=image,
        )