# Hugging Face Space: guangkaixu / GenPercept (Running on Zero) -- File size: 12,765 Bytes

# --------------------------------------------------------
# Diffusion Models Trained with Large Data Are Transferable Visual Models (https://arxiv.org/abs/2403.06090)
# Github source: https://github.com/aim-uofa/GenPercept
# Copyright (c) 2024 Zhejiang University
# Licensed under The CC0 1.0 License [see LICENSE for details]
# By Guangkai Xu
# Based on Marigold, diffusers codebases
# https://github.com/prs-eth/marigold
# https://github.com/huggingface/diffusers
# --------------------------------------------------------

import torch
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from PIL import Image
from typing import List, Dict, Union
from torch.utils.data import DataLoader, TensorDataset

from diffusers import (
    DiffusionPipeline,
    UNet2DConditionModel,
    AutoencoderKL,
)
from diffusers.utils import BaseOutput

from util.image_util import chw2hwc, colorize_depth_maps, resize_max_res, norm_to_rgb, resize_res
from util.batchsize import find_batch_size

class GenPerceptOutput(BaseOutput):
    """Result container returned by `GenPerceptPipeline.__call__`."""

    # Prediction as a numpy array; the pipeline passes it through `np.squeeze`,
    # so singleton dimensions (e.g. a batch of one) are removed.
    pred_np: np.ndarray
    # Colorized visualization of the prediction. NOTE: the pipeline stores a single
    # PIL image when one image was processed and a list of PIL images otherwise.
    pred_colored: Image.Image

class GenPerceptPipeline(DiffusionPipeline):
    """
    Perception pipeline built from a UNet and a VAE.

    `__call__` preprocesses the input, runs `single_infer` batch by batch and
    post-processes the result according to the selected `mode`.
    """

    # Latent scaling constant: `encode_rgb` multiplies the VAE latent mean by it
    # and `decode_pred` divides by it again before decoding.
    vae_scale_factor = 0.18215
    # Per-mode settings read by `__call__`:
    #   task_channel_num -- number of channels the prediction is reshaped to,
    #   interpolate      -- `F.interpolate` mode used to resize back to the input size.
    task_infos = {
        'depth':    dict(task_channel_num=1, interpolate='bilinear', ),
        'seg':      dict(task_channel_num=3, interpolate='nearest', ),
        'sr':       dict(task_channel_num=3, interpolate='nearest', ),
        'normal':   dict(task_channel_num=3, interpolate='bilinear', ),
    }

    def __init__(
        self,
        unet: UNet2DConditionModel,
        vae: AutoencoderKL,
        customized_head=None,
        empty_text_embed=None,
    ):
        """
        Build the pipeline from its components.

        Args:
            unet (UNet2DConditionModel):
                UNet that is run on the image latent.
            vae (AutoencoderKL):
                VAE used to encode the image and decode the prediction.
            customized_head (optional):
                Optional head applied to UNet features instead of decoding
                the UNet output with the VAE. Defaults to None.
            empty_text_embed (optional):
                Text embedding that `single_infer` feeds to the UNet as
                `encoder_hidden_states`. Defaults to None.
        """
        super().__init__()

        # Kept as a plain attribute; it is not passed to `register_modules`.
        self.empty_text_embed = empty_text_embed

        # Register the sub-modules with the pipeline.
        self.register_modules(
            unet=unet,
            vae=vae,
            customized_head=customized_head,
        )
    
    @torch.no_grad()
    def __call__(
        self,
        input_image: Union[Image.Image, torch.Tensor],
        mode: str = 'depth',
        resize_hard: bool = False,
        processing_res: int = 768,
        match_input_res: bool = True,
        batch_size: int = 0,
        color_map: str = "Spectral",
        show_progress_bar: bool = True,
    ) -> GenPerceptOutput:
        """
        Function invoked when calling the pipeline.

        Args:
            input_image (Image or torch.Tensor):
                Either a PIL image (converted to RGB here) or a tensor of
                shape [B, 3, H, W] with values already normalized to [-1, 1].
                Tensor inputs are used as they are and are never resized.
            mode (str, optional):
                Task to run; must be a key of `task_infos`
                ('depth', 'seg', 'sr' or 'normal').
                Defaults to 'depth'.
            resize_hard (bool, optional):
                For PIL inputs, resize with `resize_res` instead of
                `resize_max_res`.
                Defaults to False.
            processing_res (int, optional):
                Maximum resolution of processing (PIL inputs only).
                If set to 0: will not resize at all.
                Defaults to 768.
            match_input_res (bool, optional):
                Resize the prediction back to the input resolution.
                Defaults to True.
            batch_size (int, optional):
                Inference batch size.
                If set to 0, the script will automatically decide the proper batch size.
                Defaults to 0.
            color_map (str, optional):
                Colormap used to colorize the depth map (depth mode only).
                Defaults to "Spectral".
            show_progress_bar (bool, optional):
                Display a progress bar over the inference batches.
                Defaults to True.
        Returns:
            `GenPerceptOutput`: `pred_np` is the squeezed numpy prediction and
            `pred_colored` is one PIL image when a single image was processed,
            otherwise a list of PIL images.
        Raises:
            KeyError: If `mode` is not a key of `task_infos`.
            AssertionError: If `processing_res` is invalid or the input values
                fall outside [-1, 1].
            NotImplementedError: If a customized head is combined with a mode
                other than 'depth' (no post-processing exists for that case).
        """

        device = self.device

        task_info = self.task_infos[mode]
        task_channel_num = task_info['task_channel_num']

        if not match_input_res:
            assert (
                processing_res is not None
            ), "Value error: `match_input_res=False` is only valid when `processing_res` is set."
        assert processing_res >= 0

        # Only depth post-processing supports a customized head; fail before
        # spending time on inference rather than after it.
        if mode != 'depth' and self.customized_head is not None:
            raise NotImplementedError(
                f"Post-processing for mode '{mode}' with a customized head is not implemented."
            )

        # ----------------- Image Preprocess -----------------

        if isinstance(input_image, torch.Tensor):  # [B, 3, H, W]
            rgb_norm = input_image.to(device)
            input_size = input_image.shape[2:]
            bs_imgs = rgb_norm.shape[0]
            assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
            rgb_norm = rgb_norm.to(self.dtype)
        else:
            # TODO: check the kitti evaluation resolution here.
            # PIL reports (width, height); keep (height, width) for F.interpolate.
            input_size = (input_image.size[1], input_image.size[0])
            # Resize image
            if processing_res > 0:
                resize_fn = resize_res if resize_hard else resize_max_res
                input_image = resize_fn(
                    input_image, max_edge_resolution=processing_res
                )
            input_image = input_image.convert("RGB")
            image = np.asarray(input_image)

            # Normalize rgb values from [0, 255] to [-1, 1]
            rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
            rgb_norm = rgb / 255.0 * 2.0 - 1.0
            rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
            rgb_norm = rgb_norm[None].to(device)  # add the batch dimension
            assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
            bs_imgs = 1

        # ----------------- Predicting -----------------

        single_rgb_dataset = TensorDataset(rgb_norm)
        if batch_size > 0:
            _bs = batch_size
        else:
            _bs = find_batch_size(
                ensemble_size=1,
                # Spatial dimensions only; the channel dimension is not a resolution.
                input_res=max(rgb_norm.shape[-2:]),
                dtype=self.dtype,
            )

        single_rgb_loader = DataLoader(
            single_rgb_dataset, batch_size=_bs, shuffle=False
        )

        if show_progress_bar:
            iterable = tqdm(
                single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
            )
        else:
            iterable = single_rgb_loader

        # Predict (batched)
        pred_list = []
        for (batched_img,) in iterable:
            pred = self.single_infer(
                rgb_in=batched_img,
                mode=mode,
            )
            pred_list.append(pred.detach().clone())
        preds = torch.cat(pred_list, dim=0).squeeze()
        # Restore a fixed 4D layout: [bs_imgs, task_channel_num, H, W]
        preds = preds.view(bs_imgs, task_channel_num, preds.shape[-2], preds.shape[-1])

        if match_input_res:
            preds = F.interpolate(preds, input_size, mode=task_info['interpolate'])

        # ----------------- Post processing -----------------
        if mode == 'depth':
            preds = preds[:, 0]  # [bs_imgs, H, W]
            # Scale every prediction to [0, 1] independently
            flat = preds.reshape(bs_imgs, -1)
            min_d = flat.min(dim=1)[0]
            max_d = flat.max(dim=1)[0]
            depth_range = max_d - min_d
            # A constant map has zero range; divide by 1 instead so the result
            # is all zeros rather than NaN.
            depth_range = torch.where(
                depth_range > 0, depth_range, torch.ones_like(depth_range)
            )
            preds = (preds - min_d[:, None, None]) / depth_range[:, None, None]
            preds = preds.cpu().numpy().astype(np.float32)
            # Colorize
            pred_colored_img_list = []
            for depth_map in preds:
                pred_colored_chw = colorize_depth_maps(
                    depth_map, 0, 1, cmap=color_map
                ).squeeze()  # [3, H, W], value in (0, 1)
                pred_colored_chw = (pred_colored_chw * 255).astype(np.uint8)
                pred_colored_img_list.append(
                    Image.fromarray(chw2hwc(pred_colored_chw))
                )

        elif mode == 'seg' or mode == 'sr':
            # shift [-1, 1] to [0, 1], then to [0, 255]
            preds = (preds + 1.0) / 2.0
            preds = preds * 255
            # Clip output range
            preds = preds.clip(0, 255).cpu().numpy().astype(np.uint8)
            pred_colored_img_list = [
                Image.fromarray(chw2hwc(pred_chw)) for pred_chw in preds
            ]

        elif mode == 'normal':
            preds = preds.clip(-1, 1).cpu().numpy()  # [-1, 1]
            pred_colored_img_list = [
                Image.fromarray(chw2hwc(norm_to_rgb(pred_chw))) for pred_chw in preds
            ]

        else:
            # Only reachable if `task_infos` gains a mode without post-processing.
            raise NotImplementedError(
                f"Post-processing for mode '{mode}' is not implemented."
            )

        return GenPerceptOutput(
            pred_np=np.squeeze(preds),
            # A single image is returned bare, several images as a list.
            pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
        )

    @torch.no_grad()
    def single_infer(
        self, 
        rgb_in: torch.Tensor, 
        mode: str = 'depth',
    ) -> torch.Tensor:
        """
        Run one forward pass of the network on a batch of images.

        Args:
            rgb_in (torch.Tensor):
                Batch of input RGB images; the first dimension is the batch size.
            mode (str, optional):
                Task name. For 'depth' (without a customized head) the decoded
                channels are averaged into a single channel.
                Defaults to 'depth'.

        Returns:
            torch.Tensor: The prediction. Without a customized head it is the
            VAE-decoded output clipped to [-1, 1]; with a customized head it is
            the raw output of that head.

        Raises:
            ValueError: If `empty_text_embed` was not provided to the pipeline.
        """
        if self.empty_text_embed is None:
            raise ValueError(
                "`empty_text_embed` is required for inference but is None."
            )

        device = rgb_in.device
        bs_imgs = rgb_in.shape[0]
        # The UNet is always queried at the fixed timestep 1, once per image.
        timesteps = torch.ones(bs_imgs, dtype=torch.long, device=device)

        # Encode image
        rgb_latent = self.encode_rgb(rgb_in)

        # One copy of the text embedding per image
        # (original note gave the shape as [bs_imgs, 77, 1024] -- TODO confirm).
        batch_embed = self.empty_text_embed.repeat((rgb_latent.shape[0], 1, 1)).to(device)

        # Forward!
        if self.customized_head is not None:
            # The head consumes the UNet features in reversed order.
            unet_features = self.unet(rgb_latent, timesteps, encoder_hidden_states=batch_embed, return_feature_only=True)[0][::-1]
            return self.customized_head(unet_features)

        unet_output = self.unet(
            rgb_latent, timesteps, encoder_hidden_states=batch_embed
        )  # [bs_imgs, 4, h, w]
        # The prediction latent is the negated UNet output.
        pred_latent = -unet_output.sample
        pred = self.decode_pred(pred_latent)
        if mode == 'depth':
            # mean of output channels
            pred = pred.mean(dim=1, keepdim=True)
        # clip prediction
        return torch.clip(pred, -1.0, 1.0)


    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
        """
        Encode RGB image into latent.

        Only the mean half of the VAE moments is used (no sampling), scaled
        by `vae_scale_factor`.

        Args:
            rgb_in (torch.Tensor):
                Input RGB image to be encoded.

        Returns:
            torch.Tensor: Image latent
        """
        try:
            # encode in the input's own dtype
            h_temp = self.vae.encoder(rgb_in)
            moments = self.vae.quant_conv(h_temp)
        except RuntimeError:
            # Retry in float32. Presumably this covers dtype mismatches or ops
            # unsupported in reduced precision -- TODO confirm. Only RuntimeError
            # (which includes NotImplementedError) is retried so that
            # KeyboardInterrupt/SystemExit and unrelated bugs are not swallowed.
            h_temp = self.vae.encoder(rgb_in.float())
            moments = self.vae.quant_conv(h_temp.float())

        # The moments hold two halves along the channel axis; keep the first (mean).
        mean, _ = torch.chunk(moments, 2, dim=1)
        # scale latent
        return mean * self.vae_scale_factor

    def decode_pred(self, pred_latent: torch.Tensor) -> torch.Tensor:
        """
        Turn a prediction latent back into a prediction label with the VAE.

        Args:
            pred_latent (torch.Tensor):
                Latent produced by the network, still scaled by
                `vae_scale_factor`.

        Returns:
            torch.Tensor: Output of the VAE decoder for the unscaled latent.
        """
        # Undo the latent scaling applied at encoding time.
        unscaled_latent = pred_latent / self.vae_scale_factor
        # Post-quantization convolution followed by the decoder.
        vae = self.vae
        return vae.decoder(vae.post_quant_conv(unscaled_latent))