developy committed · Commit 7f10cdc · verified · 1 Parent(s): a7638ff

Upload 21 files

apdepth/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
+ from .marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput # noqa: F401
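
The package root re-exports the pipeline and its output class, so downstream code can import both directly from `apdepth`. A minimal usage sketch (not part of this commit; it assumes a Marigold-style checkpoint whose components match the pipeline's `__init__`, and the repo id below is illustrative):

import torch
from PIL import Image

from apdepth import MarigoldPipeline, MarigoldDepthOutput

# Illustrative checkpoint id; substitute whichever weights this repo ships with.
pipe = MarigoldPipeline.from_pretrained("prs-eth/marigold-v1-0")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = Image.open("input.jpg")  # any RGB image
out: MarigoldDepthOutput = pipe(
    image,
    denoising_steps=10,   # checked by _check_inference_step
    ensemble_size=1,
    processing_res=768,
    color_map="Spectral",
)
print(out.depth_np.shape)        # (H, W) float array with values in [0, 1]
out.depth_colored.save("depth_vis.png")
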
apdepth/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (216 Bytes).
 
apdepth/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (229 Bytes).
 
apdepth/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (214 Bytes).
 
apdepth/__pycache__/marigold_pipeline.cpython-310.pyc ADDED
Binary file (15.4 kB).
 
apdepth/__pycache__/marigold_pipeline.cpython-312.pyc ADDED
Binary file (19.9 kB).
 
apdepth/__pycache__/marigold_pipeline.cpython-38.pyc ADDED
Binary file (15 kB).
 
apdepth/marigold_pipeline.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ # Last modified: 2024-05-24
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # --------------------------------------------------------------------------
16
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
17
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
18
+ # More information about the method can be found at https://marigoldmonodepth.github.io
19
+ # --------------------------------------------------------------------------
20
+
21
+
22
+ import logging
23
+ from typing import Dict, Optional, Union
24
+
25
+ import numpy as np
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ from diffusers import (
30
+ AutoencoderKL,
31
+ DDIMScheduler,
32
+ DiffusionPipeline,
33
+ LCMScheduler,
34
+ # UNet2DConditionModel,
35
+ )
36
+ from marigold.modules.unet_2d_condition import UNet2DConditionModel
37
+ from diffusers.utils import BaseOutput
38
+ from PIL import Image
39
+ from torch.utils.data import DataLoader, TensorDataset
40
+ from torchvision.transforms import InterpolationMode
41
+ from torchvision.transforms.functional import pil_to_tensor, resize
42
+ from tqdm.auto import tqdm
43
+ from transformers import CLIPTextModel, CLIPTokenizer
44
+
45
+ from .util.batchsize import find_batch_size
46
+ from .util.ensemble import ensemble_depth
47
+ from .util.image_util import (
48
+ chw2hwc,
49
+ colorize_depth_maps,
50
+ get_tv_resample_method,
51
+ resize_max_res,
52
+ )
53
+
54
+
55
+ class MarigoldDepthOutput(BaseOutput):
56
+ """
57
+ Output class for Marigold monocular depth prediction pipeline.
58
+
59
+ Args:
60
+ depth_np (`np.ndarray`):
61
+ Predicted depth map, with depth values in the range of [0, 1].
62
+ depth_colored (`PIL.Image.Image`):
63
+ Colorized depth map, returned as an RGB `PIL.Image.Image`.
64
+ uncertainty (`None` or `np.ndarray`):
65
+ Uncalibrated uncertainty (MAD, median absolute deviation) coming from ensembling.
66
+ """
67
+
68
+ depth_np: np.ndarray
69
+ depth_colored: Union[None, Image.Image]
70
+ uncertainty: Union[None, np.ndarray]
71
+
72
+
73
+ class MarigoldPipeline(DiffusionPipeline):
74
+ """
75
+ Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
76
+
77
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
78
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
79
+
80
+ Args:
81
+ unet (`UNet2DConditionModel`):
82
+ Conditional U-Net to denoise the depth latent, conditioned on image latent.
83
+ vae (`AutoencoderKL`):
84
+ Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps
85
+ to and from latent representations.
86
+ scheduler (`DDIMScheduler`):
87
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
88
+ text_encoder (`CLIPTextModel`):
89
+ Text-encoder, for empty text embedding.
90
+ tokenizer (`CLIPTokenizer`):
91
+ CLIP tokenizer.
92
+ scale_invariant (`bool`, *optional*):
93
+ A model property specifying whether the predicted depth maps are scale-invariant. This value must be set in
94
+ the model config. When used together with the `shift_invariant=True` flag, the model is also called
95
+ "affine-invariant". NB: overriding this value is not supported.
96
+ shift_invariant (`bool`, *optional*):
97
+ A model property specifying whether the predicted depth maps are shift-invariant. This value must be set in
98
+ the model config. When used together with the `scale_invariant=True` flag, the model is also called
99
+ "affine-invariant". NB: overriding this value is not supported.
100
+ default_denoising_steps (`int`, *optional*):
101
+ The minimum number of denoising diffusion steps that are required to produce a prediction of reasonable
102
+ quality with the given model. This value must be set in the model config. When the pipeline is called
103
+ without explicitly setting `num_inference_steps`, the default value is used. This is required to ensure
104
+ reasonable results with various model flavors compatible with the pipeline, such as those relying on very
105
+ short denoising schedules (`LCMScheduler`) and those with full diffusion schedules (`DDIMScheduler`).
106
+ default_processing_resolution (`int`, *optional*):
107
+ The recommended value of the `processing_resolution` parameter of the pipeline. This value must be set in
108
+ the model config. When the pipeline is called without explicitly setting `processing_resolution`, the
109
+ default value is used. This is required to ensure reasonable results with various model flavors trained
110
+ with varying optimal processing resolution values.
111
+ """
112
+
113
+ rgb_latent_scale_factor = 0.18215
114
+ depth_latent_scale_factor = 0.18215
115
+
116
+ def __init__(
117
+ self,
118
+ unet: UNet2DConditionModel,
119
+ vae: AutoencoderKL,
120
+ scheduler: Union[DDIMScheduler, LCMScheduler],
121
+ text_encoder: CLIPTextModel,
122
+ tokenizer: CLIPTokenizer,
123
+ scale_invariant: Optional[bool] = True,
124
+ shift_invariant: Optional[bool] = True,
125
+ default_denoising_steps: Optional[int] = None,
126
+ default_processing_resolution: Optional[int] = None,
127
+ ):
128
+ super().__init__()
129
+ self.register_modules(
130
+ unet=unet,
131
+ vae=vae,
132
+ scheduler=scheduler,
133
+ text_encoder=text_encoder,
134
+ tokenizer=tokenizer,
135
+ )
136
+ self.register_to_config(
137
+ scale_invariant=scale_invariant,
138
+ shift_invariant=shift_invariant,
139
+ default_denoising_steps=default_denoising_steps,
140
+ default_processing_resolution=default_processing_resolution,
141
+ )
142
+
143
+ self.scale_invariant = scale_invariant
144
+ self.shift_invariant = shift_invariant
145
+ self.default_denoising_steps = default_denoising_steps
146
+ self.default_processing_resolution = default_processing_resolution
147
+
148
+ self.empty_text_embed = None
149
+
150
+ self._fft_masks = {}
151
+
152
+ @torch.no_grad()
153
+ def __call__(
154
+ self,
155
+ input_image: Union[Image.Image, torch.Tensor],
156
+ denoising_steps: Optional[int] = None,
157
+ ensemble_size: int = 5,
158
+ processing_res: Optional[int] = None,
159
+ match_input_res: bool = True,
160
+ resample_method: str = "bilinear",
161
+ batch_size: int = 0,
162
+ color_map: str = "Spectral",
163
+ show_progress_bar: bool = True,
164
+ ensemble_kwargs: Dict = None,
165
+ ) -> MarigoldDepthOutput:
166
+ """
167
+ Function invoked when calling the pipeline.
168
+
169
+ Args:
170
+ input_image (`Image`):
171
+ Input RGB (or gray-scale) image.
172
+ denoising_steps (`int`, *optional*, defaults to `None`):
173
+ Number of denoising diffusion steps during inference. The default value `None` results in automatic
174
+ selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
175
+ for Marigold-LCM models.
176
+ ensemble_size (`int`, *optional*, defaults to `5`):
177
+ Number of predictions to be ensembled.
178
+ processing_res (`int`, *optional*, defaults to `None`):
179
+ Effective processing resolution. When set to `0`, processes at the original image resolution. This
180
+ produces crisper predictions, but may also lead to the overall loss of global context. The default
181
+ value `None` resolves to the optimal value from the model config.
182
+ match_input_res (`bool`, *optional*, defaults to `True`):
183
+ Resize depth prediction to match input resolution.
184
+ Only valid if `processing_res` > 0.
185
+ resample_method: (`str`, *optional*, defaults to `bilinear`):
186
+ Resampling method used to resize images and depth predictions. One of `bilinear`, `bicubic`, or `nearest`.
187
+ batch_size (`int`, *optional*, defaults to `0`):
188
+ Inference batch size, no bigger than `ensemble_size`.
189
+ If set to 0, the script will automatically decide the proper batch size.
190
+ generator (`torch.Generator`, *optional*, defaults to `None`)
191
+ Random generator for initial noise generation.
192
+ show_progress_bar (`bool`, *optional*, defaults to `True`):
193
+ Display a progress bar of diffusion denoising.
194
+ color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized depth map generation):
195
+ Colormap used to colorize the depth map.
196
+ scale_invariant (`str`, *optional*, defaults to `True`):
197
+ Flag of scale-invariant prediction, if True, scale will be adjusted from the raw prediction.
198
+ shift_invariant (`str`, *optional*, defaults to `True`):
199
+ Flag of shift-invariant prediction, if True, shift will be adjusted from the raw prediction, if False, near plane will be fixed at 0m.
200
+ ensemble_kwargs (`dict`, *optional*, defaults to `None`):
201
+ Arguments for detailed ensembling settings.
202
+ Returns:
203
+ `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including:
204
+ - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1]
205
+ - **depth_colored** (`PIL.Image.Image`) Colorized depth map as an RGB image, None if `color_map` is `None`
206
+ - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty (MAD, median absolute deviation)
207
+ coming from ensembling. None if `ensemble_size = 1`
208
+ """
209
+ # Model-specific optimal default values leading to fast and reasonable results.
+ if denoising_steps is None:
+ denoising_steps = self.default_denoising_steps
210
+ if processing_res is None:
211
+ processing_res = self.default_processing_resolution
212
+
213
+ assert processing_res >= 0
214
+
215
+ # Check if denoising step is reasonable
216
+ self._check_inference_step(denoising_steps)
217
+
218
+ resample_method: InterpolationMode = get_tv_resample_method(resample_method)
219
+
220
+ # ----------------- Image Preprocess -----------------
221
+ # Convert to torch tensor
222
+ if isinstance(input_image, Image.Image):
223
+ input_image = input_image.convert("RGB")
224
+ # convert to torch tensor [H, W, rgb] -> [rgb, H, W]
225
+ rgb = pil_to_tensor(input_image)
226
+ rgb = rgb.unsqueeze(0) # [1, rgb, H, W]
227
+ elif isinstance(input_image, torch.Tensor):
228
+ rgb = input_image
229
+ else:
230
+ raise TypeError(f"Unknown input type: {type(input_image) = }")
231
+ input_size = rgb.shape
232
+ assert (
233
+ 4 == rgb.dim() and 3 == input_size[-3]
234
+ ), f"Wrong input shape {input_size}, expected [1, rgb, H, W]"
235
+
236
+ # Resize image
237
+ if processing_res > 0:
238
+ rgb = resize_max_res(
239
+ rgb,
240
+ max_edge_resolution=processing_res,
241
+ resample_method=resample_method,
242
+ )
243
+
244
+ # Normalize rgb values
245
+ rgb_norm: torch.Tensor = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
246
+ rgb_norm = rgb_norm.to(self.dtype)
247
+ assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
248
+
249
+ # ----------------- Predicting depth -----------------
250
+ # Batch repeated input image
251
+ duplicated_rgb = rgb_norm.expand(1, -1, -1, -1)
252
+ single_rgb_dataset = TensorDataset(duplicated_rgb)
253
+ if batch_size > 0:
254
+ _bs = batch_size
255
+ else:
256
+ _bs = 1
257
+
258
+ single_rgb_loader = DataLoader(
259
+ single_rgb_dataset, batch_size=_bs, shuffle=False
260
+ )
261
+
262
+ # Predict depth maps (batched)
263
+ depth_pred_ls = []
264
+ if show_progress_bar:
265
+ iterable = tqdm(
266
+ single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
267
+ )
268
+ else:
269
+ iterable = single_rgb_loader
270
+ for batch in iterable:
271
+ (batched_img,) = batch # image is already normalized to [-1, 1] here
272
+ depth_pred_raw = self.single_infer(
273
+ rgb_in=batched_img,
274
+ )
275
+ depth_pred_ls.append(depth_pred_raw.detach())
276
+ depth_preds = torch.concat(depth_pred_ls, dim=0)
277
+ torch.cuda.empty_cache() # clear vram cache for ensembling
278
+
279
+ # ----------------- Test-time ensembling -----------------
280
+ if ensemble_size > 1:
281
+ depth_pred, pred_uncert = ensemble_depth(
282
+ depth_preds,
283
+ scale_invariant=self.scale_invariant,
284
+ shift_invariant=self.shift_invariant,
285
+ max_res=50,
286
+ **(ensemble_kwargs or {}),
287
+ )
288
+ else:
289
+ depth_pred = depth_preds
290
+ pred_uncert = None
291
+
292
+ # Resize back to original resolution
293
+ if match_input_res:
294
+ depth_pred = resize(
295
+ depth_pred,
296
+ input_size[-2:],
297
+ interpolation=resample_method,
298
+ antialias=True,
299
+ )
300
+
301
+ # Convert to numpy
302
+ depth_pred = depth_pred.squeeze()
303
+ depth_pred = depth_pred.cpu().numpy()
304
+ if pred_uncert is not None:
305
+ pred_uncert = pred_uncert.squeeze().cpu().numpy()
306
+
307
+ # Clip output range
308
+ depth_pred = depth_pred.clip(0, 1)
309
+
310
+ # Colorize
311
+ if color_map is not None:
312
+ depth_colored = colorize_depth_maps(
313
+ depth_pred, 0, 1, cmap=color_map
314
+ ).squeeze() # [3, H, W], value in (0, 1)
315
+ depth_colored = (depth_colored * 255).astype(np.uint8)
316
+ depth_colored_hwc = chw2hwc(depth_colored)
317
+ depth_colored_img = Image.fromarray(depth_colored_hwc)
318
+ else:
319
+ depth_colored_img = None
320
+
321
+ return MarigoldDepthOutput(
322
+ depth_np=depth_pred,
323
+ depth_colored=depth_colored_img,
324
+ uncertainty=pred_uncert,
325
+ )
326
+
327
+ def _check_inference_step(self, n_step: int) -> None:
328
+ """
329
+ Check if denoising step is reasonable
330
+ Args:
331
+ n_step (`int`): denoising steps
332
+ """
333
+ assert n_step >= 1
334
+
335
+ if isinstance(self.scheduler, DDIMScheduler):
336
+ if n_step < 10:
337
+ logging.warning(
338
+ f"Too few denoising steps: {n_step}. Recommended to use the LCM checkpoint for few-step inference."
339
+ )
340
+ elif isinstance(self.scheduler, LCMScheduler):
341
+ if not 1 <= n_step <= 4:
342
+ logging.warning(
343
+ f"Non-optimal setting of denoising steps: {n_step}. Recommended setting is 1-4 steps."
344
+ )
345
+ else:
346
+ raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
347
+
348
+ def encode_empty_text(self):
349
+ """
350
+ Encode text embedding for empty prompt
351
+ """
352
+ prompt = ""
353
+ text_inputs = self.tokenizer(
354
+ prompt,
355
+ padding="do_not_pad",
356
+ max_length=self.tokenizer.model_max_length,
357
+ truncation=True,
358
+ return_tensors="pt",
359
+ )
360
+ text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
361
+ self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
362
+
363
+ @torch.no_grad()
364
+ def _get_highpass_mask(self, H, W, radius, device):
365
+ key = (H, W, radius, device)
366
+ if key not in self._fft_masks:
367
+ yy, xx = torch.meshgrid(torch.arange(H, device=device),
368
+ torch.arange(W, device=device),
369
+ indexing="ij")
370
+ yy = yy - H // 2
371
+ xx = xx - W // 2
372
+ mask_low = (xx**2 + yy**2 <= radius**2).float()
373
+ mask_high = 1 - mask_low
374
+ mask_high = mask_high[None, None, :, :]
375
+ self._fft_masks[key] = mask_high
376
+ return self._fft_masks[key]
377
+
378
+ @torch.no_grad()
379
+ def rgb_fft(self, x: torch.Tensor, highpass_radius: int = 30):
380
+ B, C, H, W = x.shape
381
+ device = x.device
382
+
383
+ f = torch.fft.fft2(x, norm="ortho")
384
+ fshift = torch.fft.fftshift(f)
385
+
386
+ mask_high = self._get_highpass_mask(H, W, highpass_radius, device)
387
+ fshift_high = fshift * mask_high
388
+
389
+ f_ishift = torch.fft.ifftshift(fshift_high)
390
+ img_high = torch.fft.ifft2(f_ishift, norm="ortho").real
391
+ return img_high
392
+
393
+ @torch.no_grad()
394
+ def single_infer(
395
+ self,
396
+ rgb_in: torch.Tensor,
397
+ ) -> torch.Tensor:
398
+ """
399
+ Perform an individual depth prediction without ensembling.
400
+
401
+ Args:
402
+ rgb_in (`torch.Tensor`):
403
+ Input RGB image.
404
+ num_inference_steps (`int`):
405
+ Number of diffusion denoising steps (DDIM) during inference.
406
+ show_pbar (`bool`):
407
+ Display a progress bar of diffusion denoising.
408
+ generator (`torch.Generator`)
409
+ Random generator for initial noise generation.
410
+ Returns:
411
+ `torch.Tensor`: Predicted depth map.
412
+ """
413
+ device = self.device
414
+ rgb_in = rgb_in.to(device)
415
+ rgb_fft = self.rgb_fft(rgb_in)
416
+
417
+ # Encode image
418
+ rgb_latent = self.encode_rgb(rgb_in)
419
+ # rgb_fft_latent = self.encode_rgb(rgb_fft)
420
+
421
+ # Batched empty text embedding
422
+ if self.empty_text_embed is None:
423
+ self.encode_empty_text()
424
+ batch_empty_text_embed = self.empty_text_embed.repeat(
425
+ (rgb_latent.shape[0], 1, 1)
426
+ ).to(device) # [B, 2, 1024]
427
+
428
+ # unet_input = torch.cat([rgb_latent, rgb_fft_latent],dim=1)
429
+
430
+ depth_latent = self.unet(
431
+ rgb_latent, 1, encoder_hidden_states=batch_empty_text_embed
432
+ ).sample # [B, 4, h, w]
433
+
434
+ depth = self.decode_depth(depth_latent)
435
+
436
+ # clip prediction
437
+ depth = torch.clip(depth, -1.0, 1.0)
438
+ # shift to [0, 1]
439
+ depth = (depth + 1.0) / 2.0
440
+
441
+ return depth
442
+
443
+ def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
444
+ """
445
+ Encode RGB image into latent.
446
+
447
+ Args:
448
+ rgb_in (`torch.Tensor`):
449
+ Input RGB image to be encoded.
450
+
451
+ Returns:
452
+ `torch.Tensor`: Image latent.
453
+ """
454
+ # encode
455
+ h = self.vae.encoder(rgb_in)
456
+ moments = self.vae.quant_conv(h)
457
+ mean, logvar = torch.chunk(moments, 2, dim=1)
458
+ # scale latent
459
+ rgb_latent = mean * self.rgb_latent_scale_factor
460
+ return rgb_latent
461
+
462
+ def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
463
+ """
464
+ Decode depth latent into depth map.
465
+
466
+ Args:
467
+ depth_latent (`torch.Tensor`):
468
+ Depth latent to be decoded.
469
+
470
+ Returns:
471
+ `torch.Tensor`: Decoded depth map.
472
+ """
473
+ # scale latent
474
+ depth_latent = depth_latent / self.depth_latent_scale_factor
475
+ # decode
476
+ z = self.vae.post_quant_conv(depth_latent)
477
+ stacked = self.vae.decoder(z)
478
+ # mean of output channels
479
+ depth_mean = stacked.mean(dim=1, keepdim=True)
480
+ return depth_mean
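
For reference, the `_get_highpass_mask` / `rgb_fft` pair above keeps only the frequencies outside a centered disk of radius `highpass_radius` in the shifted spectrum, caching one mask per (H, W, radius, device). A self-contained sketch of the same filtering, assuming only `torch` (names are illustrative):

import torch

def highpass(x: torch.Tensor, radius: int = 30) -> torch.Tensor:
    # Zero out the low frequencies inside a centered disk of the given radius.
    _, _, H, W = x.shape
    f = torch.fft.fftshift(torch.fft.fft2(x, norm="ortho"))
    yy, xx = torch.meshgrid(
        torch.arange(H, device=x.device) - H // 2,
        torch.arange(W, device=x.device) - W // 2,
        indexing="ij",
    )
    mask_high = ((xx**2 + yy**2) > radius**2).float()[None, None]
    return torch.fft.ifft2(torch.fft.ifftshift(f * mask_high), norm="ortho").real

x = torch.rand(1, 3, 64, 64) * 2.0 - 1.0  # image tensor normalized to [-1, 1], as in the pipeline
x_high = highpass(x, radius=8)
print(x_high.shape)  # torch.Size([1, 3, 64, 64])
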
apdepth/modules/__pycache__/unet_2d_blocks.cpython-310.pyc ADDED
Binary file (67.3 kB).
 
apdepth/modules/__pycache__/unet_2d_condition.cpython-310.pyc ADDED
Binary file (40.8 kB).
 
apdepth/modules/unet_2d_blocks.py ADDED
The diff for this file is too large to render.
 
apdepth/modules/unet_2d_condition.py ADDED
@@ -0,0 +1,1314 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
23
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
24
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
25
+ from diffusers.models.activations import get_activation
26
+ from diffusers.models.attention_processor import (
27
+ ADDED_KV_ATTENTION_PROCESSORS,
28
+ CROSS_ATTENTION_PROCESSORS,
29
+ Attention,
30
+ AttentionProcessor,
31
+ AttnAddedKVProcessor,
32
+ AttnProcessor,
33
+ FusedAttnProcessor2_0,
34
+ )
35
+ from diffusers.models.embeddings import (
36
+ GaussianFourierProjection,
37
+ GLIGENTextBoundingboxProjection,
38
+ ImageHintTimeEmbedding,
39
+ ImageProjection,
40
+ ImageTimeEmbedding,
41
+ TextImageProjection,
42
+ TextImageTimeEmbedding,
43
+ TextTimeEmbedding,
44
+ TimestepEmbedding,
45
+ Timesteps,
46
+ )
47
+ from diffusers.models.modeling_utils import ModelMixin
48
+ from marigold.modules.unet_2d_blocks import (
49
+ get_down_block,
50
+ get_mid_block,
51
+ get_up_block,
52
+ BlockFE,
53
+ )
54
+
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+
59
+ @dataclass
60
+ class UNet2DConditionOutput(BaseOutput):
61
+ """
62
+ The output of [`UNet2DConditionModel`].
63
+
64
+ Args:
65
+ sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
66
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
67
+ """
68
+
69
+ sample: torch.Tensor = None
70
+
71
+
72
+ class UNet2DConditionModel(
73
+ ModelMixin, ConfigMixin, FromOriginalModelMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin
74
+ ):
75
+ r"""
76
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
77
+ shaped output.
78
+
79
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
80
+ for all models (such as downloading or saving).
81
+
82
+ Parameters:
83
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
84
+ Height and width of input/output sample.
85
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
86
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
87
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
88
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
89
+ Whether to flip the sin to cos in the time embedding.
90
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
91
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
92
+ The tuple of downsample blocks to use.
93
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
94
+ Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
95
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
96
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
97
+ The tuple of upsample blocks to use.
98
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
99
+ Whether to include self-attention in the basic transformer blocks, see
100
+ [`~models.attention.BasicTransformerBlock`].
101
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
102
+ The tuple of output channels for each block.
103
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
104
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
105
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
106
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
107
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
108
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
109
+ If `None`, normalization and activation layers are skipped in post-processing.
110
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
111
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
112
+ The dimension of the cross attention features.
113
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
114
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
115
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
116
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
117
+ reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
118
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
119
+ blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
120
+ [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
121
+ [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
122
+ encoder_hid_dim (`int`, *optional*, defaults to None):
123
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
124
+ dimension to `cross_attention_dim`.
125
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
126
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
127
+ embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
128
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
129
+ num_attention_heads (`int`, *optional*):
130
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
131
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
132
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
133
+ class_embed_type (`str`, *optional*, defaults to `None`):
134
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
135
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
136
+ addition_embed_type (`str`, *optional*, defaults to `None`):
137
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
138
+ "text". "text" will use the `TextTimeEmbedding` layer.
139
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
140
+ Dimension for the timestep embeddings.
141
+ num_class_embeds (`int`, *optional*, defaults to `None`):
142
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
143
+ class conditioning with `class_embed_type` equal to `None`.
144
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
145
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
146
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
147
+ An optional override for the dimension of the projected time embedding.
148
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
149
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
150
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
151
+ timestep_post_act (`str`, *optional*, defaults to `None`):
152
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
153
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
154
+ The dimension of `cond_proj` layer in the timestep embedding.
155
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
156
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
157
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
158
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
159
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
160
+ embeddings with the class embeddings.
161
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
162
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
163
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
164
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
165
+ otherwise.
166
+ """
167
+
168
+ _supports_gradient_checkpointing = True
169
+ _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"]
170
+
171
+ @register_to_config
172
+ def __init__(
173
+ self,
174
+ sample_size: Optional[int] = None,
175
+ in_channels: int = 4,
176
+ out_channels: int = 4,
177
+ center_input_sample: bool = False,
178
+ flip_sin_to_cos: bool = True,
179
+ freq_shift: int = 0,
180
+ down_block_types: Tuple[str] = (
181
+ "CrossAttnDownBlock2D",
182
+ "CrossAttnDownBlock2D",
183
+ "CrossAttnDownBlock2D",
184
+ "DownBlock2D",
185
+ ),
186
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
187
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
188
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
189
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
190
+ layers_per_block: Union[int, Tuple[int]] = 2,
191
+ downsample_padding: int = 1,
192
+ mid_block_scale_factor: float = 1,
193
+ dropout: float = 0.0,
194
+ act_fn: str = "silu",
195
+ norm_num_groups: Optional[int] = 32,
196
+ norm_eps: float = 1e-5,
197
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
198
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
199
+ reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
200
+ encoder_hid_dim: Optional[int] = None,
201
+ encoder_hid_dim_type: Optional[str] = None,
202
+ attention_head_dim: Union[int, Tuple[int]] = 8,
203
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
204
+ dual_cross_attention: bool = False,
205
+ use_linear_projection: bool = False,
206
+ class_embed_type: Optional[str] = None,
207
+ addition_embed_type: Optional[str] = None,
208
+ addition_time_embed_dim: Optional[int] = None,
209
+ num_class_embeds: Optional[int] = None,
210
+ upcast_attention: bool = False,
211
+ resnet_time_scale_shift: str = "default",
212
+ resnet_skip_time_act: bool = False,
213
+ resnet_out_scale_factor: float = 1.0,
214
+ time_embedding_type: str = "positional",
215
+ time_embedding_dim: Optional[int] = None,
216
+ time_embedding_act_fn: Optional[str] = None,
217
+ timestep_post_act: Optional[str] = None,
218
+ time_cond_proj_dim: Optional[int] = None,
219
+ conv_in_kernel: int = 3,
220
+ conv_out_kernel: int = 3,
221
+ projection_class_embeddings_input_dim: Optional[int] = None,
222
+ attention_type: str = "default",
223
+ class_embeddings_concat: bool = False,
224
+ mid_block_only_cross_attention: Optional[bool] = None,
225
+ cross_attention_norm: Optional[str] = None,
226
+ addition_embed_type_num_heads: int = 64,
227
+ ):
228
+ super().__init__()
229
+ # print('loaded correct file')
230
+
231
+ self.sample_size = sample_size
232
+
233
+ if num_attention_heads is not None:
234
+ raise ValueError(
235
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
236
+ )
237
+
238
+ # If `num_attention_heads` is not defined (which is the case for most models)
239
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
240
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
241
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
242
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
243
+ # which is why we correct for the naming here.
244
+ num_attention_heads = num_attention_heads or attention_head_dim
245
+
246
+ # Check inputs
247
+ self._check_config(
248
+ down_block_types=down_block_types,
249
+ up_block_types=up_block_types,
250
+ only_cross_attention=only_cross_attention,
251
+ block_out_channels=block_out_channels,
252
+ layers_per_block=layers_per_block,
253
+ cross_attention_dim=cross_attention_dim,
254
+ transformer_layers_per_block=transformer_layers_per_block,
255
+ reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
256
+ attention_head_dim=attention_head_dim,
257
+ num_attention_heads=num_attention_heads,
258
+ )
259
+
260
+ # input
261
+ conv_in_padding = (conv_in_kernel - 1) // 2
262
+ self.conv_in = nn.Conv2d(
263
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
264
+ )
265
+
266
+ # time
267
+ time_embed_dim, timestep_input_dim = self._set_time_proj(
268
+ time_embedding_type,
269
+ block_out_channels=block_out_channels,
270
+ flip_sin_to_cos=flip_sin_to_cos,
271
+ freq_shift=freq_shift,
272
+ time_embedding_dim=time_embedding_dim,
273
+ )
274
+
275
+ self.time_embedding = TimestepEmbedding(
276
+ timestep_input_dim,
277
+ time_embed_dim,
278
+ act_fn=act_fn,
279
+ post_act_fn=timestep_post_act,
280
+ cond_proj_dim=time_cond_proj_dim,
281
+ )
282
+
283
+ self._set_encoder_hid_proj(
284
+ encoder_hid_dim_type,
285
+ cross_attention_dim=cross_attention_dim,
286
+ encoder_hid_dim=encoder_hid_dim,
287
+ )
288
+
289
+ # class embedding
290
+ self._set_class_embedding(
291
+ class_embed_type,
292
+ act_fn=act_fn,
293
+ num_class_embeds=num_class_embeds,
294
+ projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
295
+ time_embed_dim=time_embed_dim,
296
+ timestep_input_dim=timestep_input_dim,
297
+ )
298
+
299
+ self._set_add_embedding(
300
+ addition_embed_type,
301
+ addition_embed_type_num_heads=addition_embed_type_num_heads,
302
+ addition_time_embed_dim=addition_time_embed_dim,
303
+ cross_attention_dim=cross_attention_dim,
304
+ encoder_hid_dim=encoder_hid_dim,
305
+ flip_sin_to_cos=flip_sin_to_cos,
306
+ freq_shift=freq_shift,
307
+ projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
308
+ time_embed_dim=time_embed_dim,
309
+ )
310
+
311
+ if time_embedding_act_fn is None:
312
+ self.time_embed_act = None
313
+ else:
314
+ self.time_embed_act = get_activation(time_embedding_act_fn)
315
+
316
+ self.down_blocks = nn.ModuleList([])
317
+ self.up_blocks = nn.ModuleList([])
318
+
319
+ if isinstance(only_cross_attention, bool):
320
+ if mid_block_only_cross_attention is None:
321
+ mid_block_only_cross_attention = only_cross_attention
322
+
323
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
324
+
325
+ if mid_block_only_cross_attention is None:
326
+ mid_block_only_cross_attention = False
327
+
328
+ if isinstance(num_attention_heads, int):
329
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
330
+
331
+ if isinstance(attention_head_dim, int):
332
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
333
+
334
+ if isinstance(cross_attention_dim, int):
335
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
336
+
337
+ if isinstance(layers_per_block, int):
338
+ layers_per_block = [layers_per_block] * len(down_block_types)
339
+
340
+ if isinstance(transformer_layers_per_block, int):
341
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
342
+
343
+ if class_embeddings_concat:
344
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
345
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
346
+ # regular time embeddings
347
+ blocks_time_embed_dim = time_embed_dim * 2
348
+ else:
349
+ blocks_time_embed_dim = time_embed_dim
350
+
351
+ # down
352
+ output_channel = block_out_channels[0]
353
+ for i, down_block_type in enumerate(down_block_types):
354
+ input_channel = output_channel
355
+ output_channel = block_out_channels[i]
356
+ is_final_block = i == len(block_out_channels) - 1
357
+
358
+ down_block = get_down_block(
359
+ down_block_type,
360
+ num_layers=layers_per_block[i],
361
+ transformer_layers_per_block=transformer_layers_per_block[i],
362
+ in_channels=input_channel,
363
+ out_channels=output_channel,
364
+ temb_channels=blocks_time_embed_dim,
365
+ add_downsample=not is_final_block,
366
+ resnet_eps=norm_eps,
367
+ resnet_act_fn=act_fn,
368
+ resnet_groups=norm_num_groups,
369
+ cross_attention_dim=cross_attention_dim[i],
370
+ num_attention_heads=num_attention_heads[i],
371
+ downsample_padding=downsample_padding,
372
+ dual_cross_attention=dual_cross_attention,
373
+ use_linear_projection=use_linear_projection,
374
+ only_cross_attention=only_cross_attention[i],
375
+ upcast_attention=upcast_attention,
376
+ resnet_time_scale_shift=resnet_time_scale_shift,
377
+ attention_type=attention_type,
378
+ resnet_skip_time_act=resnet_skip_time_act,
379
+ resnet_out_scale_factor=resnet_out_scale_factor,
380
+ cross_attention_norm=cross_attention_norm,
381
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
382
+ dropout=dropout,
383
+ )
384
+ self.down_blocks.append(down_block)
385
+
386
+ # mid
387
+ self.mid_block = get_mid_block(
388
+ mid_block_type,
389
+ temb_channels=blocks_time_embed_dim,
390
+ in_channels=block_out_channels[-1],
391
+ resnet_eps=norm_eps,
392
+ resnet_act_fn=act_fn,
393
+ resnet_groups=norm_num_groups,
394
+ output_scale_factor=mid_block_scale_factor,
395
+ transformer_layers_per_block=transformer_layers_per_block[-1],
396
+ num_attention_heads=num_attention_heads[-1],
397
+ cross_attention_dim=cross_attention_dim[-1],
398
+ dual_cross_attention=dual_cross_attention,
399
+ use_linear_projection=use_linear_projection,
400
+ mid_block_only_cross_attention=mid_block_only_cross_attention,
401
+ upcast_attention=upcast_attention,
402
+ resnet_time_scale_shift=resnet_time_scale_shift,
403
+ attention_type=attention_type,
404
+ resnet_skip_time_act=resnet_skip_time_act,
405
+ cross_attention_norm=cross_attention_norm,
406
+ attention_head_dim=attention_head_dim[-1],
407
+ dropout=dropout,
408
+ )
409
+
410
+ # count how many layers upsample the images
411
+ self.num_upsamplers = 0
412
+
413
+ # up
414
+ reversed_block_out_channels = list(reversed(block_out_channels))
415
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
416
+ reversed_layers_per_block = list(reversed(layers_per_block))
417
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
418
+ reversed_transformer_layers_per_block = (
419
+ list(reversed(transformer_layers_per_block))
420
+ if reverse_transformer_layers_per_block is None
421
+ else reverse_transformer_layers_per_block
422
+ )
423
+ only_cross_attention = list(reversed(only_cross_attention))
424
+
425
+ output_channel = reversed_block_out_channels[0]
426
+ for i, up_block_type in enumerate(up_block_types):
427
+ is_final_block = i == len(block_out_channels) - 1
428
+
429
+ prev_output_channel = output_channel
430
+ output_channel = reversed_block_out_channels[i]
431
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
432
+
433
+ # add upsample block for all BUT final layer
434
+ if not is_final_block:
435
+ add_upsample = True
436
+ self.num_upsamplers += 1
437
+ else:
438
+ add_upsample = False
439
+
440
+ up_block = get_up_block(
441
+ up_block_type,
442
+ num_layers=reversed_layers_per_block[i] + 1,
443
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
444
+ in_channels=input_channel,
445
+ out_channels=output_channel,
446
+ prev_output_channel=prev_output_channel,
447
+ temb_channels=blocks_time_embed_dim,
448
+ add_upsample=add_upsample,
449
+ resnet_eps=norm_eps,
450
+ resnet_act_fn=act_fn,
451
+ resolution_idx=i,
452
+ resnet_groups=norm_num_groups,
453
+ cross_attention_dim=reversed_cross_attention_dim[i],
454
+ num_attention_heads=reversed_num_attention_heads[i],
455
+ dual_cross_attention=dual_cross_attention,
456
+ use_linear_projection=use_linear_projection,
457
+ only_cross_attention=only_cross_attention[i],
458
+ upcast_attention=upcast_attention,
459
+ resnet_time_scale_shift=resnet_time_scale_shift,
460
+ attention_type=attention_type,
461
+ resnet_skip_time_act=resnet_skip_time_act,
462
+ resnet_out_scale_factor=resnet_out_scale_factor,
463
+ cross_attention_norm=cross_attention_norm,
464
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
465
+ dropout=dropout,
466
+ )
467
+ self.up_blocks.append(up_block)
468
+ prev_output_channel = output_channel
469
+
470
+ # out
471
+ if norm_num_groups is not None:
472
+ self.conv_norm_out = nn.GroupNorm(
473
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
474
+ )
475
+
476
+ self.conv_act = get_activation(act_fn)
477
+
478
+ else:
479
+ self.conv_norm_out = None
480
+ self.conv_act = None
481
+
482
+ conv_out_padding = (conv_out_kernel - 1) // 2
483
+ self.conv_out = nn.Conv2d(
484
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
485
+ )
486
+
487
+ self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim)
488
+
489
+ def _check_config(
490
+ self,
491
+ down_block_types: Tuple[str],
492
+ up_block_types: Tuple[str],
493
+ only_cross_attention: Union[bool, Tuple[bool]],
494
+ block_out_channels: Tuple[int],
495
+ layers_per_block: Union[int, Tuple[int]],
496
+ cross_attention_dim: Union[int, Tuple[int]],
497
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
498
+ reverse_transformer_layers_per_block: bool,
499
+ attention_head_dim: int,
500
+ num_attention_heads: Optional[Union[int, Tuple[int]]],
501
+ ):
502
+ if len(down_block_types) != len(up_block_types):
503
+ raise ValueError(
504
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
505
+ )
506
+
507
+ if len(block_out_channels) != len(down_block_types):
508
+ raise ValueError(
509
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
510
+ )
511
+
512
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
513
+ raise ValueError(
514
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
515
+ )
516
+
517
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
518
+ raise ValueError(
519
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
520
+ )
521
+
522
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
523
+ raise ValueError(
524
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
525
+ )
526
+
527
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
528
+ raise ValueError(
529
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
530
+ )
531
+
532
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
533
+ raise ValueError(
534
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
535
+ )
536
+ if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
537
+ for layer_number_per_block in transformer_layers_per_block:
538
+ if isinstance(layer_number_per_block, list):
539
+ raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
540
+
541
+ def _set_time_proj(
542
+ self,
543
+ time_embedding_type: str,
544
+ block_out_channels: int,
545
+ flip_sin_to_cos: bool,
546
+ freq_shift: float,
547
+ time_embedding_dim: int,
548
+ ) -> Tuple[int, int]:
549
+ if time_embedding_type == "fourier":
550
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
551
+ if time_embed_dim % 2 != 0:
552
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
553
+ self.time_proj = GaussianFourierProjection(
554
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
555
+ )
556
+ timestep_input_dim = time_embed_dim
557
+ elif time_embedding_type == "positional":
558
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
559
+
560
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
561
+ timestep_input_dim = block_out_channels[0]
562
+ else:
563
+ raise ValueError(
564
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
565
+ )
566
+
567
+ return time_embed_dim, timestep_input_dim
568
+
569
+ def _set_encoder_hid_proj(
570
+ self,
571
+ encoder_hid_dim_type: Optional[str],
572
+ cross_attention_dim: Union[int, Tuple[int]],
573
+ encoder_hid_dim: Optional[int],
574
+ ):
575
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
576
+ encoder_hid_dim_type = "text_proj"
577
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
578
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
579
+
580
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
581
+ raise ValueError(
582
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
583
+ )
584
+
585
+ if encoder_hid_dim_type == "text_proj":
586
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
587
+ elif encoder_hid_dim_type == "text_image_proj":
588
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
589
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
590
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
591
+ self.encoder_hid_proj = TextImageProjection(
592
+ text_embed_dim=encoder_hid_dim,
593
+ image_embed_dim=cross_attention_dim,
594
+ cross_attention_dim=cross_attention_dim,
595
+ )
596
+ elif encoder_hid_dim_type == "image_proj":
597
+ # Kandinsky 2.2
598
+ self.encoder_hid_proj = ImageProjection(
599
+ image_embed_dim=encoder_hid_dim,
600
+ cross_attention_dim=cross_attention_dim,
601
+ )
602
+ elif encoder_hid_dim_type is not None:
603
+ raise ValueError(
604
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
605
+ )
606
+ else:
607
+ self.encoder_hid_proj = None
608
+
609
+ def _set_class_embedding(
610
+ self,
611
+ class_embed_type: Optional[str],
612
+ act_fn: str,
613
+ num_class_embeds: Optional[int],
614
+ projection_class_embeddings_input_dim: Optional[int],
615
+ time_embed_dim: int,
616
+ timestep_input_dim: int,
617
+ ):
618
+ if class_embed_type is None and num_class_embeds is not None:
619
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
620
+ elif class_embed_type == "timestep":
621
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
622
+ elif class_embed_type == "identity":
623
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
624
+ elif class_embed_type == "projection":
625
+ if projection_class_embeddings_input_dim is None:
626
+ raise ValueError(
627
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
628
+ )
629
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
630
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
631
+ # 2. it projects from an arbitrary input dimension.
632
+ #
633
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
634
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
635
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
636
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
637
+ elif class_embed_type == "simple_projection":
638
+ if projection_class_embeddings_input_dim is None:
639
+ raise ValueError(
640
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
641
+ )
642
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
643
+ else:
644
+ self.class_embedding = None
645
+
646
+ def _set_add_embedding(
647
+ self,
648
+ addition_embed_type: str,
649
+ addition_embed_type_num_heads: int,
650
+ addition_time_embed_dim: Optional[int],
651
+ flip_sin_to_cos: bool,
652
+ freq_shift: float,
653
+ cross_attention_dim: Optional[int],
654
+ encoder_hid_dim: Optional[int],
655
+ projection_class_embeddings_input_dim: Optional[int],
656
+ time_embed_dim: int,
657
+ ):
658
+ if addition_embed_type == "text":
659
+ if encoder_hid_dim is not None:
660
+ text_time_embedding_from_dim = encoder_hid_dim
661
+ else:
662
+ text_time_embedding_from_dim = cross_attention_dim
663
+
664
+ self.add_embedding = TextTimeEmbedding(
665
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
666
+ )
667
+ elif addition_embed_type == "text_image":
668
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
669
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
670
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
671
+ self.add_embedding = TextImageTimeEmbedding(
672
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
673
+ )
674
+ elif addition_embed_type == "text_time":
675
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
676
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
677
+ elif addition_embed_type == "image":
678
+ # Kandinsky 2.2
679
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
680
+ elif addition_embed_type == "image_hint":
681
+ # Kandinsky 2.2 ControlNet
682
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
683
+ elif addition_embed_type is not None:
684
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
685
+
686
+ def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
687
+ if attention_type in ["gated", "gated-text-image"]:
688
+ positive_len = 768
689
+ if isinstance(cross_attention_dim, int):
690
+ positive_len = cross_attention_dim
691
+ elif isinstance(cross_attention_dim, (list, tuple)):
692
+ positive_len = cross_attention_dim[0]
693
+
694
+ feature_type = "text-only" if attention_type == "gated" else "text-image"
695
+ self.position_net = GLIGENTextBoundingboxProjection(
696
+ positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
697
+ )
698
+
699
+ @property
700
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
701
+ r"""
702
+ Returns:
703
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
704
+ indexed by their weight names.
705
+ """
706
+ # set recursively
707
+ processors = {}
708
+
709
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
710
+ if hasattr(module, "get_processor"):
711
+ processors[f"{name}.processor"] = module.get_processor()
712
+
713
+ for sub_name, child in module.named_children():
714
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
715
+
716
+ return processors
717
+
718
+ for name, module in self.named_children():
719
+ fn_recursive_add_processors(name, module, processors)
720
+
721
+ return processors
722
+
723
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
724
+ r"""
725
+ Sets the attention processor to use to compute attention.
726
+
727
+ Parameters:
728
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
729
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
730
+ for **all** `Attention` layers.
731
+
732
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
733
+ processor. This is strongly recommended when setting trainable attention processors.
734
+
735
+ """
736
+ count = len(self.attn_processors.keys())
737
+
738
+ if isinstance(processor, dict) and len(processor) != count:
739
+ raise ValueError(
740
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
741
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
742
+ )
743
+
744
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
745
+ if hasattr(module, "set_processor"):
746
+ if not isinstance(processor, dict):
747
+ module.set_processor(processor)
748
+ else:
749
+ module.set_processor(processor.pop(f"{name}.processor"))
750
+
751
+ for sub_name, child in module.named_children():
752
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
753
+
754
+ for name, module in self.named_children():
755
+ fn_recursive_attn_processor(name, module, processor)
756
+
757
+ def set_default_attn_processor(self):
758
+ """
759
+ Disables custom attention processors and sets the default attention implementation.
760
+ """
761
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
762
+ processor = AttnAddedKVProcessor()
763
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
764
+ processor = AttnProcessor()
765
+ else:
766
+ raise ValueError(
767
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
768
+ )
769
+
770
+ self.set_attn_processor(processor)
771
+
772
+ def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"):
773
+ r"""
774
+ Enable sliced attention computation.
775
+
776
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
777
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
778
+
779
+ Args:
780
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
781
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
782
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
783
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
784
+ must be a multiple of `slice_size`.
785
+ """
786
+ sliceable_head_dims = []
787
+
788
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
789
+ if hasattr(module, "set_attention_slice"):
790
+ sliceable_head_dims.append(module.sliceable_head_dim)
791
+
792
+ for child in module.children():
793
+ fn_recursive_retrieve_sliceable_dims(child)
794
+
795
+ # retrieve number of attention layers
796
+ for module in self.children():
797
+ fn_recursive_retrieve_sliceable_dims(module)
798
+
799
+ num_sliceable_layers = len(sliceable_head_dims)
800
+
801
+ if slice_size == "auto":
802
+ # half the attention head size is usually a good trade-off between
803
+ # speed and memory
804
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
805
+ elif slice_size == "max":
806
+ # make smallest slice possible
807
+ slice_size = num_sliceable_layers * [1]
808
+
809
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
810
+
811
+ if len(slice_size) != len(sliceable_head_dims):
812
+ raise ValueError(
813
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
814
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
815
+ )
816
+
817
+ for i in range(len(slice_size)):
818
+ size = slice_size[i]
819
+ dim = sliceable_head_dims[i]
820
+ if size is not None and size > dim:
821
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
822
+
823
+ # Recursively walk through all the children.
824
+ # Any children which exposes the set_attention_slice method
825
+ # gets the message
826
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
827
+ if hasattr(module, "set_attention_slice"):
828
+ module.set_attention_slice(slice_size.pop())
829
+
830
+ for child in module.children():
831
+ fn_recursive_set_attention_slice(child, slice_size)
832
+
833
+ reversed_slice_size = list(reversed(slice_size))
834
+ for module in self.children():
835
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
836
+
837
+ def _set_gradient_checkpointing(self, module, value=False):
838
+ if hasattr(module, "gradient_checkpointing"):
839
+ module.gradient_checkpointing = value
840
+
841
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
842
+ r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
843
+
844
+ The suffixes after the scaling factors represent the stage blocks where they are being applied.
845
+
846
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
847
+ are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
848
+
849
+ Args:
850
+ s1 (`float`):
851
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
852
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
853
+ s2 (`float`):
854
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
855
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
856
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
857
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
858
+ """
859
+ for i, upsample_block in enumerate(self.up_blocks):
860
+ setattr(upsample_block, "s1", s1)
861
+ setattr(upsample_block, "s2", s2)
862
+ setattr(upsample_block, "b1", b1)
863
+ setattr(upsample_block, "b2", b2)
864
+
865
+ def disable_freeu(self):
866
+ """Disables the FreeU mechanism."""
867
+ freeu_keys = {"s1", "s2", "b1", "b2"}
868
+ for i, upsample_block in enumerate(self.up_blocks):
869
+ for k in freeu_keys:
870
+ if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
871
+ setattr(upsample_block, k, None)
872
+
873
+ def fuse_qkv_projections(self):
874
+ """
875
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
876
+ are fused. For cross-attention modules, key and value projection matrices are fused.
877
+
878
+ <Tip warning={true}>
879
+
880
+ This API is 🧪 experimental.
881
+
882
+ </Tip>
883
+ """
884
+ self.original_attn_processors = None
885
+
886
+ for _, attn_processor in self.attn_processors.items():
887
+ if "Added" in str(attn_processor.__class__.__name__):
888
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
889
+
890
+ self.original_attn_processors = self.attn_processors
891
+
892
+ for module in self.modules():
893
+ if isinstance(module, Attention):
894
+ module.fuse_projections(fuse=True)
895
+
896
+ self.set_attn_processor(FusedAttnProcessor2_0())
897
+
898
+ def unfuse_qkv_projections(self):
899
+ """Disables the fused QKV projection if enabled.
900
+
901
+ <Tip warning={true}>
902
+
903
+ This API is 🧪 experimental.
904
+
905
+ </Tip>
906
+
907
+ """
908
+ if self.original_attn_processors is not None:
909
+ self.set_attn_processor(self.original_attn_processors)
910
+
911
+ def get_time_embed(
912
+ self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
913
+ ) -> Optional[torch.Tensor]:
914
+ timesteps = timestep
915
+ if not torch.is_tensor(timesteps):
916
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
917
+ # This would be a good case for the `match` statement (Python 3.10+)
918
+ is_mps = sample.device.type == "mps"
919
+ if isinstance(timestep, float):
920
+ dtype = torch.float32 if is_mps else torch.float64
921
+ else:
922
+ dtype = torch.int32 if is_mps else torch.int64
923
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
924
+ elif len(timesteps.shape) == 0:
925
+ timesteps = timesteps[None].to(sample.device)
926
+
927
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
928
+ timesteps = timesteps.expand(sample.shape[0])
929
+
930
+ t_emb = self.time_proj(timesteps)
931
+ # `Timesteps` does not contain any weights and will always return f32 tensors
932
+ # but time_embedding might actually be running in fp16. so we need to cast here.
933
+ # there might be better ways to encapsulate this.
934
+ t_emb = t_emb.to(dtype=sample.dtype)
935
+ return t_emb
936
+
937
+ def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
938
+ class_emb = None
939
+ if self.class_embedding is not None:
940
+ if class_labels is None:
941
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
942
+
943
+ if self.config.class_embed_type == "timestep":
944
+ class_labels = self.time_proj(class_labels)
945
+
946
+ # `Timesteps` does not contain any weights and will always return f32 tensors
947
+ # there might be better ways to encapsulate this.
948
+ class_labels = class_labels.to(dtype=sample.dtype)
949
+
950
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
951
+ return class_emb
952
+
953
+ def get_aug_embed(
954
+ self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
955
+ ) -> Optional[torch.Tensor]:
956
+ aug_emb = None
957
+ if self.config.addition_embed_type == "text":
958
+ aug_emb = self.add_embedding(encoder_hidden_states)
959
+ elif self.config.addition_embed_type == "text_image":
960
+ # Kandinsky 2.1 - style
961
+ if "image_embeds" not in added_cond_kwargs:
962
+ raise ValueError(
963
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
964
+ )
965
+
966
+ image_embs = added_cond_kwargs.get("image_embeds")
967
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
968
+ aug_emb = self.add_embedding(text_embs, image_embs)
969
+ elif self.config.addition_embed_type == "text_time":
970
+ # SDXL - style
971
+ if "text_embeds" not in added_cond_kwargs:
972
+ raise ValueError(
973
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
974
+ )
975
+ text_embeds = added_cond_kwargs.get("text_embeds")
976
+ if "time_ids" not in added_cond_kwargs:
977
+ raise ValueError(
978
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
979
+ )
980
+ time_ids = added_cond_kwargs.get("time_ids")
981
+ time_embeds = self.add_time_proj(time_ids.flatten())
982
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
983
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
984
+ add_embeds = add_embeds.to(emb.dtype)
985
+ aug_emb = self.add_embedding(add_embeds)
986
+ elif self.config.addition_embed_type == "image":
987
+ # Kandinsky 2.2 - style
988
+ if "image_embeds" not in added_cond_kwargs:
989
+ raise ValueError(
990
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
991
+ )
992
+ image_embs = added_cond_kwargs.get("image_embeds")
993
+ aug_emb = self.add_embedding(image_embs)
994
+ elif self.config.addition_embed_type == "image_hint":
995
+ # Kandinsky 2.2 - style
996
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
997
+ raise ValueError(
998
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
999
+ )
1000
+ image_embs = added_cond_kwargs.get("image_embeds")
1001
+ hint = added_cond_kwargs.get("hint")
1002
+ aug_emb = self.add_embedding(image_embs, hint)
1003
+ return aug_emb
1004
+
1005
+ def process_encoder_hidden_states(
1006
+ self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
1007
+ ) -> torch.Tensor:
1008
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
1009
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
1010
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
1011
+ # Kandinsky 2.1 - style
1012
+ if "image_embeds" not in added_cond_kwargs:
1013
+ raise ValueError(
1014
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1015
+ )
1016
+
1017
+ image_embeds = added_cond_kwargs.get("image_embeds")
1018
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
1019
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
1020
+ # Kandinsky 2.2 - style
1021
+ if "image_embeds" not in added_cond_kwargs:
1022
+ raise ValueError(
1023
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1024
+ )
1025
+ image_embeds = added_cond_kwargs.get("image_embeds")
1026
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
1027
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
1028
+ if "image_embeds" not in added_cond_kwargs:
1029
+ raise ValueError(
1030
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1031
+ )
1032
+
1033
+ if hasattr(self, "text_encoder_hid_proj") and self.text_encoder_hid_proj is not None:
1034
+ encoder_hidden_states = self.text_encoder_hid_proj(encoder_hidden_states)
1035
+
1036
+ image_embeds = added_cond_kwargs.get("image_embeds")
1037
+ image_embeds = self.encoder_hid_proj(image_embeds)
1038
+ encoder_hidden_states = (encoder_hidden_states, image_embeds)
1039
+ return encoder_hidden_states
1040
+
1041
+ def forward(
1042
+ self,
1043
+ sample: torch.Tensor,
1044
+ timestep: Union[torch.Tensor, float, int],
1045
+ encoder_hidden_states: torch.Tensor,
1046
+ class_labels: Optional[torch.Tensor] = None,
1047
+ timestep_cond: Optional[torch.Tensor] = None,
1048
+ attention_mask: Optional[torch.Tensor] = None,
1049
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1050
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
1051
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1052
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
1053
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1054
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1055
+ return_dict: bool = True,
1056
+ ) -> Union[UNet2DConditionOutput, Tuple]:
1057
+ r"""
1058
+ The [`UNet2DConditionModel`] forward method.
1059
+
1060
+ Args:
1061
+ sample (`torch.Tensor`):
1062
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
1063
+ timestep (`torch.Tensor` or `float` or `int`): The current denoising timestep.
1064
+ encoder_hidden_states (`torch.Tensor`):
1065
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
1066
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
1067
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
1068
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
1069
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
1070
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
1071
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
1072
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
1073
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
1074
+ negative values to the attention scores corresponding to "discard" tokens.
1075
+ cross_attention_kwargs (`dict`, *optional*):
1076
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1077
+ `self.processor` in
1078
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1079
+ added_cond_kwargs: (`dict`, *optional*):
1080
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
1081
+ are passed along to the UNet blocks.
1082
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
1083
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
1084
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
1085
+ A tensor that if specified is added to the residual of the middle unet block.
1086
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
1087
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
1088
+ encoder_attention_mask (`torch.Tensor`):
1089
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
1090
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
1091
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
1092
+ return_dict (`bool`, *optional*, defaults to `True`):
1093
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
1094
+ tuple.
1095
+
1096
+ Returns:
1097
+ [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
1098
+ If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
1099
+ otherwise a `tuple` is returned where the first element is the sample tensor.
1100
+ """
1101
+ # By default, each spatial dimension of `sample` has to be a multiple of the overall upsampling factor.
1102
+ # The overall upsampling factor is equal to 2 ** (number of upsampling layers).
1103
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
1104
+ # on the fly if necessary.
1105
+ default_overall_up_factor = 2**self.num_upsamplers
1106
+
1107
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
1108
+ forward_upsample_size = False
1109
+ upsample_size = None
1110
+
1111
+ for dim in sample.shape[-2:]:
1112
+ if dim % default_overall_up_factor != 0:
1113
+ # Forward upsample size to force interpolation output size.
1114
+ forward_upsample_size = True
1115
+ break
1116
+
1117
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
1118
+ # expects mask of shape:
1119
+ # [batch, key_tokens]
1120
+ # adds singleton query_tokens dimension:
1121
+ # [batch, 1, key_tokens]
1122
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
1123
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
1124
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
1125
+ if attention_mask is not None:
1126
+ # assume that mask is expressed as:
1127
+ # (1 = keep, 0 = discard)
1128
+ # convert mask into a bias that can be added to attention scores:
1129
+ # (keep = +0, discard = -10000.0)
1130
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
1131
+ attention_mask = attention_mask.unsqueeze(1)
1132
+
1133
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
1134
+ if encoder_attention_mask is not None:
1135
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
1136
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
1137
+
1138
+ # 0. center input if necessary
1139
+ if self.config.center_input_sample:
1140
+ sample = 2 * sample - 1.0
1141
+
1142
+ # 1. time
1143
+ t_emb = self.get_time_embed(sample=sample, timestep=timestep)
1144
+ emb = self.time_embedding(t_emb, timestep_cond)
1145
+ aug_emb = None
1146
+
1147
+ class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
1148
+ if class_emb is not None:
1149
+ if self.config.class_embeddings_concat:
1150
+ emb = torch.cat([emb, class_emb], dim=-1)
1151
+ else:
1152
+ emb = emb + class_emb
1153
+
1154
+ aug_emb = self.get_aug_embed(
1155
+ emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
1156
+ )
1157
+ if self.config.addition_embed_type == "image_hint":
1158
+ aug_emb, hint = aug_emb
1159
+ sample = torch.cat([sample, hint], dim=1)
1160
+
1161
+ emb = emb + aug_emb if aug_emb is not None else emb
1162
+
1163
+ if self.time_embed_act is not None:
1164
+ emb = self.time_embed_act(emb)
1165
+
1166
+ encoder_hidden_states = self.process_encoder_hidden_states(
1167
+ encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
1168
+ )
1169
+
1170
+ # 2. pre-process
1171
+ sample = self.conv_in(sample)
1172
+
1173
+ # 2.5 GLIGEN position net
1174
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
1175
+ cross_attention_kwargs = cross_attention_kwargs.copy()
1176
+ gligen_args = cross_attention_kwargs.pop("gligen")
1177
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
1178
+
1179
+ # 3. down
1180
+ # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
1181
+ # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
1182
+ if cross_attention_kwargs is not None:
1183
+ cross_attention_kwargs = cross_attention_kwargs.copy()
1184
+ lora_scale = cross_attention_kwargs.pop("scale", 1.0)
1185
+ else:
1186
+ lora_scale = 1.0
1187
+
1188
+ if USE_PEFT_BACKEND:
1189
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
1190
+ scale_lora_layers(self, lora_scale)
1191
+
1192
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
1193
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
1194
+ is_adapter = down_intrablock_additional_residuals is not None
1195
+ # maintain backward compatibility for legacy usage, where
1196
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
1197
+ # but can only use one or the other
1198
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
1199
+ deprecate(
1200
+ "T2I should not use down_block_additional_residuals",
1201
+ "1.3.0",
1202
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
1203
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
1204
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
1205
+ standard_warn=False,
1206
+ )
1207
+ down_intrablock_additional_residuals = down_block_additional_residuals
1208
+ is_adapter = True
1209
+
1210
+ down_block_res_samples = (sample,)
1211
+ for downsample_block in self.down_blocks:
1212
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
1213
+ # For t2i-adapter CrossAttnDownBlock2D
1214
+ additional_residuals = {}
1215
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1216
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
1217
+
1218
+ sample, res_samples = downsample_block(
1219
+ hidden_states=sample,
1220
+ temb=emb,
1221
+ encoder_hidden_states=encoder_hidden_states,
1222
+ attention_mask=attention_mask,
1223
+ cross_attention_kwargs=cross_attention_kwargs,
1224
+ encoder_attention_mask=encoder_attention_mask,
1225
+ **additional_residuals,
1226
+ )
1227
+ else:
1228
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
1229
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1230
+ sample += down_intrablock_additional_residuals.pop(0)
1231
+
1232
+ down_block_res_samples += res_samples
1233
+
1234
+ if is_controlnet:
1235
+ new_down_block_res_samples = ()
1236
+
1237
+ for down_block_res_sample, down_block_additional_residual in zip(
1238
+ down_block_res_samples, down_block_additional_residuals
1239
+ ):
1240
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
1241
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
1242
+
1243
+ down_block_res_samples = new_down_block_res_samples
1244
+
1245
+ # 4. mid
1246
+ if self.mid_block is not None:
1247
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
1248
+ sample = self.mid_block(
1249
+ sample,
1250
+ emb,
1251
+ encoder_hidden_states=encoder_hidden_states,
1252
+ attention_mask=attention_mask,
1253
+ cross_attention_kwargs=cross_attention_kwargs,
1254
+ encoder_attention_mask=encoder_attention_mask,
1255
+ )
1256
+ else:
1257
+ sample = self.mid_block(sample, emb)
1258
+
1259
+ # To support T2I-Adapter-XL
1260
+ if (
1261
+ is_adapter
1262
+ and len(down_intrablock_additional_residuals) > 0
1263
+ and sample.shape == down_intrablock_additional_residuals[0].shape
1264
+ ):
1265
+ sample += down_intrablock_additional_residuals.pop(0)
1266
+
1267
+ if is_controlnet:
1268
+ sample = sample + mid_block_additional_residual
1269
+
1270
+ # 5. up
1271
+ for i, upsample_block in enumerate(self.up_blocks):
1272
+ is_final_block = i == len(self.up_blocks) - 1
1273
+
1274
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1275
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1276
+
1277
+ # if we have not reached the final block and need to forward the
1278
+ # upsample size, we do it here
1279
+ if not is_final_block and forward_upsample_size:
1280
+ upsample_size = down_block_res_samples[-1].shape[2:]
1281
+
1282
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1283
+ sample = upsample_block(
1284
+ hidden_states=sample,
1285
+ temb=emb,
1286
+ res_hidden_states_tuple=res_samples,
1287
+ encoder_hidden_states=encoder_hidden_states,
1288
+ cross_attention_kwargs=cross_attention_kwargs,
1289
+ upsample_size=upsample_size,
1290
+ attention_mask=attention_mask,
1291
+ encoder_attention_mask=encoder_attention_mask,
1292
+ )
1293
+ else:
1294
+ sample = upsample_block(
1295
+ hidden_states=sample,
1296
+ temb=emb,
1297
+ res_hidden_states_tuple=res_samples,
1298
+ upsample_size=upsample_size,
1299
+ )
1300
+
1301
+ # 6. post-process
1302
+ if self.conv_norm_out:
1303
+ sample = self.conv_norm_out(sample)
1304
+ sample = self.conv_act(sample)
1305
+ sample = self.conv_out(sample)
1306
+
1307
+ if USE_PEFT_BACKEND:
1308
+ # remove `lora_scale` from each PEFT layer
1309
+ unscale_lora_layers(self, lora_scale)
1310
+
1311
+ if not return_dict:
1312
+ return (sample,)
1313
+
1314
+ return UNet2DConditionOutput(sample=sample)
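
The mask handling near the top of `forward` above converts a keep/discard mask into an additive attention bias with a singleton query-tokens dimension. A minimal, standalone sketch of that conversion with illustrative values (not part of the committed file):

import torch

# 1 = keep, 0 = discard, shape (batch, key_tokens)
attention_mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])

# Same transformation as in forward(): keep -> 0.0, discard -> -10000.0,
# then add a singleton query_tokens dimension so the bias broadcasts over
# attention scores of shape (batch, heads, query_tokens, key_tokens).
bias = (1 - attention_mask) * -10000.0
bias = bias.unsqueeze(1)  # shape (batch, 1, key_tokens)

print(bias.shape, bias)  # roughly: [[[0., 0., -10000., -10000.]]]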
apdepth/util/__pycache__/batchsize.cpython-310.pyc ADDED
Binary file (1.73 kB). View file
 
apdepth/util/__pycache__/batchsize.cpython-312.pyc ADDED
Binary file (2.68 kB). View file
 
apdepth/util/__pycache__/ensemble.cpython-310.pyc ADDED
Binary file (6.51 kB). View file
 
apdepth/util/__pycache__/ensemble.cpython-312.pyc ADDED
Binary file (10.2 kB). View file
 
apdepth/util/__pycache__/image_util.cpython-310.pyc ADDED
Binary file (2.81 kB). View file
 
apdepth/util/__pycache__/image_util.cpython-312.pyc ADDED
Binary file (4.87 kB). View file
 
apdepth/util/batchsize.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
+ import torch
22
+ import math
23
+
24
+
25
+ # Search table for suggested max. inference batch size
26
+ bs_search_table = [
27
+ # tested on A100-PCIE-80GB
28
+ {"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
29
+ {"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
30
+ # tested on A100-PCIE-40GB
31
+ {"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
32
+ {"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
33
+ {"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
34
+ {"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
35
+ # tested on RTX3090, RTX4090
36
+ {"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
37
+ {"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
38
+ {"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
39
+ {"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
40
+ {"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
41
+ {"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
42
+ # tested on GTX1080Ti
43
+ {"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
44
+ {"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
45
+ {"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
46
+ {"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
47
+ {"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
48
+ ]
49
+
50
+
51
+ def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
52
+ """
53
+ Automatically search for a suitable inference batch size.
54
+
55
+ Args:
56
+ ensemble_size (`int`):
57
+ Number of predictions to be ensembled.
58
+ input_res (`int`):
59
+ Operating resolution of the input image.
60
+
61
+ Returns:
62
+ `int`: Operating batch size.
63
+ """
64
+ if not torch.cuda.is_available():
65
+ return 1
66
+
67
+ total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
68
+ filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
69
+ for settings in sorted(
70
+ filtered_bs_search_table,
71
+ key=lambda k: (k["res"], -k["total_vram"]),
72
+ ):
73
+ if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
74
+ bs = settings["bs"]
75
+ if bs > ensemble_size:
76
+ bs = ensemble_size
77
+ elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
78
+ bs = math.ceil(ensemble_size / 2)
79
+ return bs
80
+
81
+ return 1
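
A small usage sketch for `find_batch_size`, assuming the repository root is on the Python path so `apdepth.util.batchsize` is importable; the GPU figures in the comment are only what the search table above would suggest on one example card:

import torch

from apdepth.util.batchsize import find_batch_size

# Pick a batch size for ensembling 10 predictions at 768 px in half precision.
# Without CUDA this falls back to 1; on a ~24 GB GPU the table suggests 18,
# which the function then caps at the ensemble size (10).
bs = find_batch_size(ensemble_size=10, input_res=768, dtype=torch.float16)
print(bs)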
apdepth/util/ensemble.py ADDED
@@ -0,0 +1,200 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
+ from functools import partial
22
+ from typing import Optional, Tuple
23
+
24
+ import numpy as np
25
+ import torch
26
+
27
+ from .image_util import get_tv_resample_method, resize_max_res
28
+
29
+
30
+ def inter_distances(tensors: torch.Tensor):
31
+ """
32
+ To calculate the distance between each two depth maps.
33
+ """
34
+ distances = []
35
+ for i, j in torch.combinations(torch.arange(tensors.shape[0])):
36
+ arr1 = tensors[i : i + 1]
37
+ arr2 = tensors[j : j + 1]
38
+ distances.append(arr1 - arr2)
39
+ dist = torch.concatenate(distances, dim=0)
40
+ return dist
41
+
42
+
43
+ def ensemble_depth(
44
+ depth: torch.Tensor,
45
+ scale_invariant: bool = True,
46
+ shift_invariant: bool = True,
47
+ output_uncertainty: bool = False,
48
+ reduction: str = "median",
49
+ regularizer_strength: float = 0.02,
50
+ max_iter: int = 2,
51
+ tol: float = 1e-3,
52
+ max_res: int = 1024,
53
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
54
+ """
55
+ Ensembles depth maps represented by the `depth` tensor with expected shape `(B, 1, H, W)`, where B is the
56
+ number of ensemble members for a given prediction of size `(H x W)`. Even though the function is designed for
57
+ depth maps, it can also be used with disparity maps as long as the input tensor values are non-negative. The
58
+ alignment happens when the predictions have one or more degrees of freedom, that is when they are either
59
+ affine-invariant (`scale_invariant=True` and `shift_invariant=True`), or just scale-invariant (only
60
+ `scale_invariant=True`). For absolute predictions (`scale_invariant=False` and `shift_invariant=False`)
61
+ alignment is skipped and only ensembling is performed.
62
+
63
+ Args:
64
+ depth (`torch.Tensor`):
65
+ Input ensemble depth maps.
66
+ scale_invariant (`bool`, *optional*, defaults to `True`):
67
+ Whether to treat predictions as scale-invariant.
68
+ shift_invariant (`bool`, *optional*, defaults to `True`):
69
+ Whether to treat predictions as shift-invariant.
70
+ output_uncertainty (`bool`, *optional*, defaults to `False`):
71
+ Whether to output uncertainty map.
72
+ reduction (`str`, *optional*, defaults to `"median"`):
73
+ Reduction method used to ensemble aligned predictions. The accepted values are: `"mean"` and
74
+ `"median"`.
75
+ regularizer_strength (`float`, *optional*, defaults to `0.02`):
76
+ Strength of the regularizer that pulls the aligned predictions to the unit range from 0 to 1.
77
+ max_iter (`int`, *optional*, defaults to `2`):
78
+ Maximum number of the alignment solver steps. Refer to `scipy.optimize.minimize` function, `options`
79
+ argument.
80
+ tol (`float`, *optional*, defaults to `1e-3`):
81
+ Alignment solver tolerance. The solver stops when the tolerance is reached.
82
+ max_res (`int`, *optional*, defaults to `1024`):
83
+ Resolution at which the alignment is performed; `None` matches the `processing_resolution`.
84
+ Returns:
85
+ A tensor of aligned and ensembled depth maps and optionally a tensor of uncertainties of the same shape:
86
+ `(1, 1, H, W)`.
87
+ """
88
+ if depth.dim() != 4 or depth.shape[1] != 1:
89
+ raise ValueError(f"Expecting 4D tensor of shape [B,1,H,W]; got {depth.shape}.")
90
+ if reduction not in ("mean", "median"):
91
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
92
+ if not scale_invariant and shift_invariant:
93
+ raise ValueError("Pure shift-invariant ensembling is not supported.")
94
+
95
+ def init_param(depth: torch.Tensor):
96
+ init_min = depth.reshape(ensemble_size, -1).min(dim=1).values
97
+ init_max = depth.reshape(ensemble_size, -1).max(dim=1).values
98
+
99
+ if scale_invariant and shift_invariant:
100
+ init_s = 1.0 / (init_max - init_min).clamp(min=1e-6)
101
+ init_t = -init_s * init_min
102
+ param = torch.cat((init_s, init_t)).cpu().numpy()
103
+ elif scale_invariant:
104
+ init_s = 1.0 / init_max.clamp(min=1e-6)
105
+ param = init_s.cpu().numpy()
106
+ else:
107
+ raise ValueError("Unrecognized alignment.")
108
+
109
+ return param
110
+
111
+ def align(depth: torch.Tensor, param: np.ndarray) -> torch.Tensor:
112
+ if scale_invariant and shift_invariant:
113
+ s, t = np.split(param, 2)
114
+ s = torch.from_numpy(s).to(depth).view(ensemble_size, 1, 1, 1)
115
+ t = torch.from_numpy(t).to(depth).view(ensemble_size, 1, 1, 1)
116
+ out = depth * s + t
117
+ elif scale_invariant:
118
+ s = torch.from_numpy(param).to(depth).view(ensemble_size, 1, 1, 1)
119
+ out = depth * s
120
+ else:
121
+ raise ValueError("Unrecognized alignment.")
122
+ return out
123
+
124
+ def ensemble(
125
+ depth_aligned: torch.Tensor, return_uncertainty: bool = False
126
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
127
+ uncertainty = None
128
+ if reduction == "mean":
129
+ prediction = torch.mean(depth_aligned, dim=0, keepdim=True)
130
+ if return_uncertainty:
131
+ uncertainty = torch.std(depth_aligned, dim=0, keepdim=True)
132
+ elif reduction == "median":
133
+ prediction = torch.median(depth_aligned, dim=0, keepdim=True).values
134
+ if return_uncertainty:
135
+ uncertainty = torch.median(
136
+ torch.abs(depth_aligned - prediction), dim=0, keepdim=True
137
+ ).values
138
+ else:
139
+ raise ValueError(f"Unrecognized reduction method: {reduction}.")
140
+ return prediction, uncertainty
141
+
142
+ def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
143
+ cost = 0.0
144
+ depth_aligned = align(depth, param)
145
+
146
+ for i, j in torch.combinations(torch.arange(ensemble_size)):
147
+ diff = depth_aligned[i] - depth_aligned[j]
148
+ cost += (diff**2).mean().sqrt().item()
149
+
150
+ if regularizer_strength > 0:
151
+ prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
152
+ err_near = (0.0 - prediction.min()).abs().item()
153
+ err_far = (1.0 - prediction.max()).abs().item()
154
+ cost += (err_near + err_far) * regularizer_strength
155
+
156
+ return cost
157
+
158
+ def compute_param(depth: torch.Tensor):
159
+ import scipy
160
+
161
+ depth_to_align = depth.to(torch.float32)
162
+ if max_res is not None and max(depth_to_align.shape[2:]) > max_res:
163
+ depth_to_align = resize_max_res(
164
+ depth_to_align, max_res, get_tv_resample_method("nearest-exact")
165
+ )
166
+
167
+ param = init_param(depth_to_align)
168
+
169
+ res = scipy.optimize.minimize(
170
+ partial(cost_fn, depth=depth_to_align),
171
+ param,
172
+ method="BFGS",
173
+ tol=tol,
174
+ options={"maxiter": max_iter, "disp": False},
175
+ )
176
+
177
+ return res.x
178
+
179
+ requires_aligning = scale_invariant or shift_invariant
180
+ ensemble_size = depth.shape[0]
181
+
182
+ if requires_aligning:
183
+ param = compute_param(depth)
184
+ depth = align(depth, param)
185
+
186
+ depth, uncertainty = ensemble(depth, return_uncertainty=output_uncertainty)
187
+
188
+ depth_max = depth.max()
189
+ if scale_invariant and shift_invariant:
190
+ depth_min = depth.min()
191
+ elif scale_invariant:
192
+ depth_min = 0
193
+ else:
194
+ raise ValueError("Unrecognized alignment.")
195
+ depth_range = (depth_max - depth_min).clamp(min=1e-6)
196
+ depth = (depth - depth_min) / depth_range
197
+ if output_uncertainty:
198
+ uncertainty /= depth_range
199
+
200
+ return depth, uncertainty # [1,1,H,W], [1,1,H,W]
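
A short, hedged example of calling `ensemble_depth` on random stand-in predictions (the alignment solver needs `scipy`, and the import path assumes the package layout above is on the Python path):

import torch

from apdepth.util.ensemble import ensemble_depth

# Five affine-invariant stand-in predictions for a 48x64 image, shape (B, 1, H, W).
preds = torch.rand(5, 1, 48, 64)

depth, uncertainty = ensemble_depth(
    preds,
    scale_invariant=True,
    shift_invariant=True,
    output_uncertainty=True,
    reduction="median",
)
print(depth.shape, uncertainty.shape)          # both (1, 1, 48, 64)
print(depth.min().item(), depth.max().item())  # normalized to the [0, 1] range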
apdepth/util/image_util.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ # Last modified: 2024-05-24
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # --------------------------------------------------------------------------
16
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
17
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
18
+ # More information about the method can be found at https://marigoldmonodepth.github.io
19
+ # --------------------------------------------------------------------------
20
+
21
+
22
+ import matplotlib
23
+ import numpy as np
24
+ import torch
25
+ from torchvision.transforms import InterpolationMode
26
+ from torchvision.transforms.functional import resize
27
+
28
+
29
+ def colorize_depth_maps(
30
+ depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
31
+ ):
32
+ """
33
+ Colorize depth maps.
34
+ """
35
+ assert len(depth_map.shape) >= 2, "Invalid dimension"
36
+
37
+ if isinstance(depth_map, torch.Tensor):
38
+ depth = depth_map.detach().squeeze().numpy()
39
+ elif isinstance(depth_map, np.ndarray):
40
+ depth = depth_map.copy().squeeze()
41
+ # reshape to [ (B,) H, W ]
42
+ if depth.ndim < 3:
43
+ depth = depth[np.newaxis, :, :]
44
+
45
+ # colorize
46
+ cm = matplotlib.colormaps[cmap]
47
+ depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
48
+ img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1
49
+ img_colored_np = np.rollaxis(img_colored_np, 3, 1)
50
+
51
+ if valid_mask is not None:
52
+ if isinstance(depth_map, torch.Tensor):
53
+ valid_mask = valid_mask.detach().numpy()
54
+ valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W]
55
+ if valid_mask.ndim < 3:
56
+ valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
57
+ else:
58
+ valid_mask = valid_mask[:, np.newaxis, :, :]
59
+ valid_mask = np.repeat(valid_mask, 3, axis=1)
60
+ img_colored_np[~valid_mask] = 0
61
+
62
+ if isinstance(depth_map, torch.Tensor):
63
+ img_colored = torch.from_numpy(img_colored_np).float()
64
+ elif isinstance(depth_map, np.ndarray):
65
+ img_colored = img_colored_np
66
+
67
+ return img_colored
68
+
69
+
70
+ def chw2hwc(chw):
71
+ assert 3 == len(chw.shape)
72
+ if isinstance(chw, torch.Tensor):
73
+ hwc = torch.permute(chw, (1, 2, 0))
74
+ elif isinstance(chw, np.ndarray):
75
+ hwc = np.moveaxis(chw, 0, -1)
76
+ return hwc
77
+
78
+
79
+ def resize_max_res(
80
+ img: torch.Tensor,
81
+ max_edge_resolution: int,
82
+ resample_method: InterpolationMode = InterpolationMode.BILINEAR,
83
+ ) -> torch.Tensor:
84
+ """
85
+ Resize image to limit maximum edge length while keeping aspect ratio.
86
+
87
+ Args:
88
+ img (`torch.Tensor`):
89
+ Image tensor to be resized. Expected shape: [B, C, H, W]
90
+ max_edge_resolution (`int`):
91
+ Maximum edge length (pixel).
92
+ resample_method (`PIL.Image.Resampling`):
93
+ Resampling method used to resize images.
94
+
95
+ Returns:
96
+ `torch.Tensor`: Resized image.
97
+ """
98
+ assert 4 == img.dim(), f"Invalid input shape {img.shape}"
99
+
100
+ original_height, original_width = img.shape[-2:]
101
+ downscale_factor = min(
102
+ max_edge_resolution / original_width, max_edge_resolution / original_height
103
+ )
104
+
105
+ new_width = int(original_width * downscale_factor)
106
+ new_height = int(original_height * downscale_factor)
107
+
108
+ resized_img = resize(img, (new_height, new_width), resample_method, antialias=True)
109
+ return resized_img
110
+
111
+
112
+ def get_tv_resample_method(method_str: str) -> InterpolationMode:
113
+ resample_method_dict = {
114
+ "bilinear": InterpolationMode.BILINEAR,
115
+ "bicubic": InterpolationMode.BICUBIC,
116
+ "nearest": InterpolationMode.NEAREST_EXACT,
117
+ "nearest-exact": InterpolationMode.NEAREST_EXACT,
118
+ }
119
+ resample_method = resample_method_dict.get(method_str, None)
120
+ if resample_method is None:
121
+ raise ValueError(f"Unknown resampling method: {resample_method}")
122
+ else:
123
+ return resample_method
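
Finally, a brief sketch of the image helpers in this file on dummy data (requires `torch`, `torchvision`, and `matplotlib`; shapes and values are illustrative, and the import path assumes the repository root is on the Python path):

import torch

from apdepth.util.image_util import (
    colorize_depth_maps,
    get_tv_resample_method,
    resize_max_res,
)

# Downscale a dummy [B, C, H, W] image so its longer edge is at most 512 px.
img = torch.rand(1, 3, 480, 640)
img_small = resize_max_res(img, 512, get_tv_resample_method("bilinear"))
print(img_small.shape)  # torch.Size([1, 3, 384, 512])

# Colorize a dummy depth map in [0, 1] with the Spectral colormap.
depth = torch.rand(480, 640)
colored = colorize_depth_maps(depth, min_depth=0.0, max_depth=1.0, cmap="Spectral")
print(colored.shape)  # torch.Size([1, 3, 480, 640])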