Spaces:

Ashoka74
/

Demo_Refurnish

Running on Zero

+import os
+from typing import Dict, Optional, Union
+import safetensors
+import torch
+from diffusers.utils import _get_model_file, logging
+from safetensors import safe_open
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class CustomAdapterMixin:
+    """Mixin adding init/load/save entry points for custom adapter weights.
+    The public methods handle file resolution and (de)serialization; the
+    model-specific work is delegated to the hooks `_init_custom_adapter`,
+    `_load_custom_adapter` and `_save_custom_adapter`, which raise
+    `NotImplementedError` here and are meant to be overridden.
+    """
+    def init_custom_adapter(self, *args, **kwargs):
+        """Forward all arguments unchanged to the `_init_custom_adapter` hook."""
+        self._init_custom_adapter(*args, **kwargs)
+    def _init_custom_adapter(self, *args, **kwargs):
+        """Hook: set up the adapter modules. Must be overridden."""
+        raise NotImplementedError
+    def load_custom_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        weight_name: str,
+        subfolder: Optional[str] = None,
+        **kwargs,
+    ):
+        """Obtain an adapter state dict and hand it to `_load_custom_adapter`.
+        Args:
+            pretrained_model_name_or_path_or_dict: an already-loaded state dict
+                (used as is), or a location that `_get_model_file` resolves to
+                a weights file.
+            weight_name: file name of the weights; a `.safetensors` suffix
+                selects the safetensors reader, anything else uses `torch.load`.
+            subfolder: optional subfolder passed to `_get_model_file`.
+            **kwargs: `cache_dir`, `force_download`, `proxies`,
+                `local_files_only`, `token` and `revision` are popped and
+                forwarded to `_get_model_file`; other keys are ignored.
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            # Caller already holds the tensors; nothing to resolve or read.
+            state_dict = pretrained_model_name_or_path_or_dict
+        else:
+            model_file = _get_model_file(
+                pretrained_model_name_or_path_or_dict,
+                weights_name=weight_name,
+                subfolder=subfolder,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                user_agent=user_agent,
+            )
+            if weight_name.endswith(".safetensors"):
+                with safe_open(model_file, framework="pt", device="cpu") as f:
+                    state_dict = {key: f.get_tensor(key) for key in f.keys()}
+            else:
+                # SECURITY NOTE: `torch.load` unpickles the file, which can
+                # execute arbitrary code. Only load checkpoints from trusted
+                # sources, or prefer the `.safetensors` format.
+                state_dict = torch.load(model_file, map_location="cpu")
+        self._load_custom_adapter(state_dict)
+    def _load_custom_adapter(self, state_dict):
+        """Hook: copy `state_dict` into the adapter modules. Must be overridden."""
+        raise NotImplementedError
+    def save_custom_adapter(
+        self,
+        save_directory: Union[str, os.PathLike],
+        weight_name: str,
+        safe_serialization: bool = False,
+        **kwargs,
+    ):
+        """Serialize the adapter weights to `save_directory/weight_name`.
+        Args:
+            save_directory: target directory; created if it does not exist.
+                If it points at an existing file, an error is logged and
+                nothing is written.
+            weight_name: file name for the weights inside `save_directory`.
+            safe_serialization: write with safetensors instead of `torch.save`.
+            **kwargs: forwarded to the `_save_custom_adapter` hook.
+        """
+        if os.path.isfile(save_directory):
+            logger.error(
+                f"Provided path ({save_directory}) should be a directory, not a file"
+            )
+            return
+        # Without this, saving into a directory that does not exist yet fails
+        # when the weights file is opened for writing.
+        os.makedirs(save_directory, exist_ok=True)
+        if safe_serialization:
+            # Import the submodule explicitly: a bare `import safetensors` does
+            # not by itself load `safetensors.torch`.
+            from safetensors.torch import save_file
+            def save_function(weights, filename):
+                return save_file(weights, filename, metadata={"format": "pt"})
+        else:
+            save_function = torch.save
+        # Save the model
+        output_path = os.path.join(save_directory, weight_name)
+        state_dict = self._save_custom_adapter(**kwargs)
+        save_function(state_dict, output_path)
+        logger.info(f"Custom adapter weights saved in {output_path}")
+    def _save_custom_adapter(self, **kwargs):
+        """Hook: return the adapter state dict to be saved. Must be overridden.
+        Accepts `**kwargs` because `save_custom_adapter` forwards its extra
+        keyword arguments here.
+        """
+        raise NotImplementedError

mvadapter/models/__init__.py ADDED Viewed

File without changes

mvadapter/models/attention_processor.py ADDED Viewed

	@@ -0,0 +1,373 @@

+import math
+from typing import Callable, List, Optional, Union
+import torch
+import torch.nn.functional as F
+from diffusers.models.attention_processor import Attention
+from diffusers.models.unets import UNet2DConditionModel
+from diffusers.utils import deprecate, logging
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
+from einops import rearrange
+from torch import nn
+def default_set_attn_proc_func(
+    name: str,
+    hidden_size: int,
+    cross_attention_dim: Optional[int],
+    ori_attn_proc: object,
+) -> object:
+    """Identity factory: keep the attention processor that is already installed.
+    `name`, `hidden_size` and `cross_attention_dim` are accepted only so the
+    signature matches what `set_unet_2d_condition_attn_processor` passes to its
+    factory callbacks; they are not used.
+    """
+    unchanged_processor = ori_attn_proc
+    return unchanged_processor
+def set_unet_2d_condition_attn_processor(
+    unet: UNet2DConditionModel,
+    set_self_attn_proc_func: Callable = default_set_attn_proc_func,
+    set_cross_attn_proc_func: Callable = default_set_attn_proc_func,
+    set_custom_attn_proc_func: Callable = default_set_attn_proc_func,
+    set_self_attn_module_names: Optional[List[str]] = None,
+    set_cross_attn_module_names: Optional[List[str]] = None,
+    set_custom_attn_module_names: Optional[List[str]] = None,
+) -> None:
+    """Replace the attention processors of `unet` through per-kind factories.
+    Each entry of `unet.attn_processors` is classified by its name:
+    * custom: the name contains `attn_mid_blocks` or `attn_post_blocks`;
+    * self-attention: the name ends with `attn1.processor`, contains
+      `motion_modules`, or `unet.config.cross_attention_dim` is None;
+    * cross-attention: everything else.
+    The matching factory is called as
+    `factory(name, hidden_size, cross_attention_dim, original_processor)`
+    (`cross_attention_dim` is None for custom processors) and its result is
+    installed. When a `*_module_names` list is given, only processors whose
+    name starts with one of its entries are rebuilt; the rest keep their
+    current processor. `None` means "rebuild all of that kind".
+    Raises:
+        ValueError: if a processor name starts with none of `mid_block`,
+            `up_blocks` or `down_blocks`, so its hidden size cannot be derived.
+    """
+    def is_selected(name: str, module_names: Optional[List[str]]) -> bool:
+        # `None` disables filtering; otherwise select by name prefix.
+        if module_names is None:
+            return True
+        return any(name.startswith(module_name) for module_name in module_names)
+    block_out_channels = unet.config.block_out_channels
+    attn_procs = {}
+    for name, attn_processor in unet.attn_processors.items():
+        if name.startswith("mid_block"):
+            hidden_size = block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            # Parse the whole index segment so multi-digit block ids also work.
+            block_id = int(name.split(".")[1])
+            hidden_size = list(reversed(block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name.split(".")[1])
+            hidden_size = block_out_channels[block_id]
+        else:
+            # Previously this fell through and either hit an unbound local or
+            # silently reused the hidden size of the previous processor.
+            raise ValueError(
+                f"Cannot infer hidden size for attention processor '{name}': "
+                "expected a name starting with 'mid_block', 'up_blocks' or 'down_blocks'."
+            )
+        if "attn_mid_blocks" in name or "attn_post_blocks" in name:
+            proc_func = set_custom_attn_proc_func
+            module_names = set_custom_attn_module_names
+            cross_attention_dim = None
+        else:
+            cross_attention_dim = (
+                None
+                if name.endswith("attn1.processor")
+                else unet.config.cross_attention_dim
+            )
+            if cross_attention_dim is None or "motion_modules" in name:
+                # self attention
+                proc_func = set_self_attn_proc_func
+                module_names = set_self_attn_module_names
+            else:
+                # cross attention
+                proc_func = set_cross_attn_proc_func
+                module_names = set_cross_attn_module_names
+        if is_selected(name, module_names):
+            attn_procs[name] = proc_func(
+                name, hidden_size, cross_attention_dim, attn_processor
+            )
+        else:
+            attn_procs[name] = attn_processor
+    unet.set_attn_processor(attn_procs)
+class DecoupledMVRowSelfAttnProcessor2_0(torch.nn.Module):
+    r"""
+    Attention processor for Decoupled Row-wise Self-Attention and Image Cross-Attention for PyTorch 2.0.
+    In addition to the wrapped `Attention` module's own attention, this processor can run two extra
+    branches that use q/k/v/out projections owned by the processor itself:
+    a multi-view branch (`use_mv`), where each image row attends to the same row across all `num_views`
+    views, and a reference branch (`use_ref`), which attends to reference hidden states looked up by
+    `self.name`. The branch outputs are scaled and added to the base attention output.
+    """
+    def __init__(
+        self,
+        query_dim: int,
+        inner_dim: int,
+        num_views: int = 1,
+        name: Optional[str] = None,
+        use_mv: bool = True,
+        use_ref: bool = False,
+    ):
+        """
+        Args:
+            query_dim (int): input size of the extra q/k/v projections and output size of the extra out projections.
+            inner_dim (int): output size of the extra q/k/v projections.
+            num_views (int): number of views folded into the batch dimension for the multi-view branch.
+            name (str, optional): key used in `__call__` to index `cache_hidden_states` and `ref_hidden_states`.
+            use_mv (bool): create the projections for the multi-view branch.
+            use_ref (bool): create the projections for the reference branch.
+        Raises:
+            ImportError: if `torch.nn.functional` has no `scaled_dot_product_attention`.
+        """
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "DecoupledMVRowSelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+        super().__init__()
+        self.num_views = num_views
+        self.name = name  # NOTE: need for image cross-attention
+        self.use_mv = use_mv
+        self.use_ref = use_ref
+        # Projections for the multi-view branch; only created when it is enabled.
+        if self.use_mv:
+            self.to_q_mv = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            self.to_k_mv = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            self.to_v_mv = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            # [linear projection, dropout]; indexed as [0] and [1] in `__call__`.
+            self.to_out_mv = nn.ModuleList(
+                [
+                    nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
+                    nn.Dropout(0.0),
+                ]
+            )
+        # Projections for the reference branch; only created when it is enabled.
+        if self.use_ref:
+            self.to_q_ref = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            self.to_k_ref = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            self.to_v_ref = nn.Linear(
+                in_features=query_dim, out_features=inner_dim, bias=False
+            )
+            # [linear projection, dropout]; indexed as [0] and [1] in `__call__`.
+            self.to_out_ref = nn.ModuleList(
+                [
+                    nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
+                    nn.Dropout(0.0),
+                ]
+            )
+    # NOTE: the processor logic lives in `__call__` itself rather than in `forward`.
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        mv_scale: float = 1.0,
+        ref_hidden_states: Optional[torch.FloatTensor] = None,
+        ref_scale: float = 1.0,
+        cache_hidden_states: Optional[List[torch.FloatTensor]] = None,
+        use_mv: bool = True,
+        use_ref: bool = True,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        """
+        Run the base attention of `attn` plus the optional multi-view and reference branches.
+        New args:
+            mv_scale (float): scale for multi-view self-attention.
+            ref_hidden_states (torch.FloatTensor): reference encoder hidden states for image cross-attention.
+                NOTE(review): indexed with `self.name` below, so it appears to be a name-keyed mapping
+                rather than a single tensor - confirm against the caller.
+            ref_scale (float): scale for image cross-attention.
+            cache_hidden_states (List[torch.FloatTensor]): cache hidden states from reference unet.
+                NOTE(review): written to with `self.name` as the key, so it appears to be a mapping
+                despite the `List` annotation - confirm against the caller.
+            use_mv (bool): per-call switch; the multi-view branch runs only if this and `self.use_mv` are both true.
+            use_ref (bool): per-call switch; the reference branch runs only if this and `self.use_ref` are both true.
+        Returns:
+            The combined attention output, reshaped back to 4-D when the input was 4-D.
+        """
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+        # NEW: cache hidden states for reference unet
+        # A copy of the raw input (before any normalization) is stored under this processor's name.
+        if cache_hidden_states is not None:
+            cache_hidden_states[self.name] = hidden_states.clone()
+        # NEW: whether to use multi-view attention and image cross-attention
+        # A branch is active only if it was built in `__init__` AND requested for this call.
+        use_mv = self.use_mv and use_mv
+        use_ref = self.use_ref and use_ref
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        # 4-D input (batch, channel, height, width) is flattened to (batch, height * width, channel).
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+        # Batch size and sequence length come from the key/value source when one is given.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(
+                attention_mask, sequence_length, batch_size
+            )
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(
+                batch_size, attn.heads, -1, attention_mask.shape[-1]
+            )
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+        query = attn.to_q(hidden_states)
+        # NEW: for decoupled multi-view attention
+        if use_mv:
+            query_mv = self.to_q_mv(hidden_states)
+        # NEW: for decoupled reference cross attention
+        if use_ref:
+            query_ref = self.to_q_ref(hidden_states)
+        # Without an explicit key/value source this is self-attention over `hidden_states`.
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states
+            )
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        inner_dim = key.shape[-1]
+        # NOTE(review): `head_dim` is derived from the base projection and reused for the mv/ref
+        # branches, which assumes their `inner_dim` matches the base attention's - confirm.
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(
+            batch_size, -1, attn.heads * head_dim
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        ####### Decoupled multi-view self-attention ########
+        if use_mv:
+            key_mv = self.to_k_mv(encoder_hidden_states)
+            value_mv = self.to_v_mv(encoder_hidden_states)
+            query_mv = query_mv.view(batch_size, -1, attn.heads, head_dim)
+            key_mv = key_mv.view(batch_size, -1, attn.heads, head_dim)
+            value_mv = value_mv.view(batch_size, -1, attn.heads, head_dim)
+            # The token sequence is treated as a square grid of side isqrt(sequence_length).
+            # NOTE(review): this rebinds the `height`/`width` that the 4-D reshape at the end of
+            # this method reads - confirm 4-D inputs are always square when this branch runs.
+            height = width = math.isqrt(sequence_length)
+            # row self-attention
+            # Queries: one sequence per (sample, view, row), of length `iw`.
+            query_mv = rearrange(
+                query_mv,
+                "(b nv) (ih iw) h c -> (b nv ih) iw h c",
+                nv=self.num_views,
+                ih=height,
+                iw=width,
+            ).transpose(1, 2)
+            # Keys: for each row, the tokens of that row from all views are concatenated (nv * iw).
+            key_mv = rearrange(
+                key_mv,
+                "(b nv) (ih iw) h c -> b ih (nv iw) h c",
+                nv=self.num_views,
+                ih=height,
+                iw=width,
+            )
+            # Repeat per view so the key batch lines up with the (b nv ih) query batch.
+            key_mv = (
+                key_mv.repeat_interleave(self.num_views, dim=0)
+                .view(batch_size * height, -1, attn.heads, head_dim)
+                .transpose(1, 2)
+            )
+            # Values follow exactly the same layout as the keys.
+            value_mv = rearrange(
+                value_mv,
+                "(b nv) (ih iw) h c -> b ih (nv iw) h c",
+                nv=self.num_views,
+                ih=height,
+                iw=width,
+            )
+            value_mv = (
+                value_mv.repeat_interleave(self.num_views, dim=0)
+                .view(batch_size * height, -1, attn.heads, head_dim)
+                .transpose(1, 2)
+            )
+            # No attention mask is passed to this branch.
+            hidden_states_mv = F.scaled_dot_product_attention(
+                query_mv,
+                key_mv,
+                value_mv,
+                dropout_p=0.0,
+                is_causal=False,
+            )
+            # Undo the row grouping: back to one sequence per (sample, view) with heads merged.
+            hidden_states_mv = rearrange(
+                hidden_states_mv,
+                "(b nv ih) h iw c -> (b nv) (ih iw) (h c)",
+                nv=self.num_views,
+                ih=height,
+            )
+            hidden_states_mv = hidden_states_mv.to(query.dtype)
+            # linear proj
+            hidden_states_mv = self.to_out_mv[0](hidden_states_mv)
+            # dropout
+            hidden_states_mv = self.to_out_mv[1](hidden_states_mv)
+        ####### Decoupled reference (image) cross-attention ########
+        if use_ref:
+            # Keys/values come from the reference states stored under this processor's name.
+            # NOTE(review): the views below use `batch_size`, which presumably requires the
+            # reference states to share the batch size of `hidden_states` - confirm.
+            reference_hidden_states = ref_hidden_states[self.name]
+            key_ref = self.to_k_ref(reference_hidden_states)
+            value_ref = self.to_v_ref(reference_hidden_states)
+            query_ref = query_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
+                1, 2
+            )
+            key_ref = key_ref.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value_ref = value_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
+                1, 2
+            )
+            # No attention mask is passed to this branch.
+            hidden_states_ref = F.scaled_dot_product_attention(
+                query_ref, key_ref, value_ref, dropout_p=0.0, is_causal=False
+            )
+            hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(
+                batch_size, -1, attn.heads * head_dim
+            )
+            hidden_states_ref = hidden_states_ref.to(query.dtype)
+            # linear proj
+            hidden_states_ref = self.to_out_ref[0](hidden_states_ref)
+            # dropout
+            hidden_states_ref = self.to_out_ref[1](hidden_states_ref)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        # Add the extra branches on top of the projected base output, each with its own scale.
+        if use_mv:
+            hidden_states = hidden_states + hidden_states_mv * mv_scale
+        if use_ref:
+            hidden_states = hidden_states + hidden_states_ref * ref_scale
+        # Restore the (batch, channel, height, width) layout for 4-D inputs.
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+    def set_num_views(self, num_views: int) -> None:
+        """Change the number of views used by the multi-view branch in later calls."""
+        self.num_views = num_views

mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py ADDED Viewed

	@@ -0,0 +1,953 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import numpy as np
+import PIL
+import torch
+import torch.nn as nn
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.models import (
+    AutoencoderKL,
+    ImageProjection,
+    T2IAdapter,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
+    StableDiffusionXLPipelineOutput,
+)
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
+    StableDiffusionXLPipeline,
+    rescale_noise_cfg,
+    retrieve_timesteps,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import randn_tensor
+from einops import rearrange
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+from ..loaders import CustomAdapterMixin
+from ..models.attention_processor import (
+    DecoupledMVRowSelfAttnProcessor2_0,
+    set_unet_2d_condition_attn_processor,
+)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+def retrieve_latents(
+    encoder_output: torch.Tensor,
+    generator: Optional[torch.Generator] = None,
+    sample_mode: str = "sample",
+):
+    """Extract latents from an encoder output object.
+    If the output has a `latent_dist`, return `latent_dist.sample(generator)`
+    for `sample_mode == "sample"` or `latent_dist.mode()` for
+    `sample_mode == "argmax"`. Otherwise, fall back to the `latents` attribute
+    when present.
+    Raises:
+        AttributeError: if none of the above applies.
+    """
+    has_distribution = hasattr(encoder_output, "latent_dist")
+    if has_distribution and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    if has_distribution and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    if hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    raise AttributeError("Could not access latents of provided encoder_output")
+class MVAdapterI2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        """Build the pipeline and attach an image processor for control images.
+        Every argument is passed through by keyword, unchanged, to the parent
+        constructor. Afterwards a `VaeImageProcessor` is stored as
+        `self.control_image_processor`, configured to convert to RGB and to
+        skip normalization.
+        """
+        parent_kwargs = dict(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
+            add_watermarker=add_watermarker,
+        )
+        super().__init__(**parent_kwargs)
+        # `self.vae_scale_factor` is read only after the parent constructor has run.
+        control_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_convert_rgb=True,
+            do_normalize=False,
+        )
+        self.control_image_processor = control_processor
+    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.prepare_latents
+    def prepare_image_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_images_per_prompt,
+        dtype,
+        device,
+        generator=None,
+        add_noise=True,
+    ):
+        """Turn an input image (or ready-made latents) into scaled, optionally noised latents.
+        Args:
+            image: input image batch. An input whose dimension 1 has size 4 is treated as latents
+                and is not encoded; anything else is encoded with `self.vae`.
+            timestep: passed to `self.scheduler.add_noise` when `add_noise` is true.
+            batch_size: prompt batch size; multiplied by `num_images_per_prompt` to get the
+                effective batch size.
+            num_images_per_prompt: multiplier applied to `batch_size`.
+            dtype: dtype of the returned latents (and of the noise).
+            device: device the image, statistics and noise are moved to.
+            generator: a generator or a list of generators (one per effective batch element).
+            add_noise: if true, noise is added through the scheduler at `timestep`.
+        Returns:
+            The latents, expanded to the effective batch size when that is an exact multiple.
+        Raises:
+            ValueError: on an unsupported `image` type, on a generator list whose length differs
+                from the effective batch size, or when the image batch cannot be evenly duplicated.
+        """
+        # NOTE(review): PIL images and lists pass this check, but `image.to(...)` is called below,
+        # so callers presumably preprocess to a tensor first - confirm.
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+        # Optional per-channel latent statistics from the VAE config; the view hard-codes 4 channels.
+        latents_mean = latents_std = None
+        if (
+            hasattr(self.vae.config, "latents_mean")
+            and self.vae.config.latents_mean is not None
+        ):
+            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+        if (
+            hasattr(self.vae.config, "latents_std")
+            and self.vae.config.latents_std is not None
+        ):
+            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
+        # Offload text encoder if `enable_model_cpu_offload` was enabled
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.text_encoder_2.to("cpu")
+            torch.cuda.empty_cache()
+        image = image.to(device=device, dtype=dtype)
+        # From here on `batch_size` is the effective batch size.
+        batch_size = batch_size * num_images_per_prompt
+        if image.shape[1] == 4:
+            # Already latents: used as is, without encoding or scaling.
+            init_latents = image
+        else:
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            if self.vae.config.force_upcast:
+                image = image.float()
+                self.vae.to(dtype=torch.float32)
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+            elif isinstance(generator, list):
+                # One generator per sample: tile the image up to the batch size, then encode
+                # each sample separately with its own generator.
+                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                    raise ValueError(
+                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                    )
+                init_latents = [
+                    retrieve_latents(
+                        self.vae.encode(image[i : i + 1]), generator=generator[i]
+                    )
+                    for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = retrieve_latents(
+                    self.vae.encode(image), generator=generator
+                )
+            # Put the VAE back to the working dtype after the float32 encode.
+            if self.vae.config.force_upcast:
+                self.vae.to(dtype)
+            init_latents = init_latents.to(dtype)
+            # Scale the latents; subtract the mean and divide by the std only when both are configured.
+            if latents_mean is not None and latents_std is not None:
+                latents_mean = latents_mean.to(device=device, dtype=dtype)
+                latents_std = latents_std.to(device=device, dtype=dtype)
+                init_latents = (
+                    (init_latents - latents_mean)
+                    * self.vae.config.scaling_factor
+                    / latents_std
+                )
+            else:
+                init_latents = self.vae.config.scaling_factor * init_latents
+        if (
+            batch_size > init_latents.shape[0]
+            and batch_size % init_latents.shape[0] == 0
+        ):
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat(
+                [init_latents] * additional_image_per_prompt, dim=0
+            )
+        elif (
+            batch_size > init_latents.shape[0]
+            and batch_size % init_latents.shape[0] != 0
+        ):
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            # Batch already large enough; concatenating a single tensor leaves the values unchanged.
+            init_latents = torch.cat([init_latents], dim=0)
+        if add_noise:
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+        return latents
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        num_empty_images=0,  # for concat in batch like ImageDream
+    ):
+        """Preprocess a control image and expand it to the working batch.
+        The image is run through `self.control_image_processor.preprocess` at
+        the requested size and cast to float32. If `num_empty_images > 0`, that
+        many all-zero images are appended. A single image is repeated
+        `batch_size` times; a multi-image batch is repeated
+        `num_images_per_prompt` times per image. The result is moved to
+        `device`/`dtype` and, under classifier-free guidance, concatenated with
+        itself along the batch dimension.
+        """
+        assert hasattr(
+            self, "control_image_processor"
+        ), "control_image_processor is not initialized"
+        preprocessed = self.control_image_processor.preprocess(
+            image, height=height, width=width
+        )
+        control = preprocessed.to(dtype=torch.float32)
+        if num_empty_images > 0:
+            blanks = torch.zeros_like(control[:num_empty_images])
+            control = torch.cat([control, blanks], dim=0)
+        # A lone image is broadcast to the whole batch; otherwise the image
+        # batch is taken to match the prompt batch already.
+        repeats = batch_size if control.shape[0] == 1 else num_images_per_prompt
+        control = control.repeat_interleave(repeats, dim=0)
+        control = control.to(device=device, dtype=dtype)
+        if not do_classifier_free_guidance:
+            return control
+        return torch.cat([control, control])
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        # NEW
+        mv_scale: float = 1.0,
+        # Camera or geometry condition
+        control_image: Optional[PipelineImageInput] = None,
+        control_conditioning_scale: Optional[float] = 1.0,
+        control_conditioning_factor: float = 1.0,
+        # Image condition
+        reference_image: Optional[PipelineImageInput] = None,
+        reference_conditioning_scale: Optional[float] = 1.0,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps
+        )
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat(
+                [negative_pooled_prompt_embeds, add_text_embeds], dim=0
+            )
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+        # Preprocess reference image
+        reference_image = self.image_processor.preprocess(reference_image)
+        reference_latents = self.prepare_image_latents(
+            reference_image,
+            timesteps[:1].repeat(batch_size * num_images_per_prompt),  # no use
+            batch_size,
+            1,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            add_noise=False,
+        )
+        with torch.no_grad():
+            ref_timesteps = torch.zeros_like(timesteps[0])
+            ref_hidden_states = {}
+            self.unet(
+                reference_latents,
+                ref_timesteps,
+                encoder_hidden_states=prompt_embeds[-1:],
+                added_cond_kwargs={
+                    "text_embeds": add_text_embeds[-1:],
+                    "time_ids": add_time_ids[-1:],
+                },
+                cross_attention_kwargs={
+                    "cache_hidden_states": ref_hidden_states,
+                    "use_mv": False,
+                    "use_ref": False,
+                },
+                return_dict=False,
+            )
+            ref_hidden_states = {
+                k: v.repeat_interleave(num_images_per_prompt, dim=0)
+                for k, v in ref_hidden_states.items()
+            }
+        if self.do_classifier_free_guidance:
+            ref_hidden_states = {
+                k: torch.cat([torch.zeros_like(v), v], dim=0)
+                for k, v in ref_hidden_states.items()
+            }
+        cross_attention_kwargs = {
+            "mv_scale": mv_scale,
+            "ref_hidden_states": {k: v.clone() for k, v in ref_hidden_states.items()},
+            "ref_scale": reference_conditioning_scale,
+            **(self.cross_attention_kwargs or {}),
+        }
+        # Preprocess control image
+        control_image_feature = self.prepare_control_image(
+            image=control_image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=1,  # NOTE: always 1 for control images
+            device=device,
+            dtype=latents.dtype,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+        )
+        control_image_feature = control_image_feature.to(
+            device=device, dtype=latents.dtype
+        )
+        adapter_state = self.cond_encoder(control_image_feature)
+        for i, state in enumerate(adapter_state):
+            adapter_state[i] = state * control_conditioning_scale
+        # 8. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(
+                list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
+            )
+            timesteps = timesteps[:num_inference_steps]
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
+                batch_size * num_images_per_prompt
+            )
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2)
+                    if self.do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                added_cond_kwargs = {
+                    "text_embeds": add_text_embeds,
+                    "time_ids": add_time_ids,
+                }
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                if i < int(num_inference_steps * control_conditioning_factor):
+                    down_intrablock_additional_residuals = [
+                        state.clone() for state in adapter_state
+                    ]
+                else:
+                    down_intrablock_additional_residuals = None
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_intrablock_additional_residuals=down_intrablock_additional_residuals,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_text,
+                        guidance_rescale=self.guidance_rescale,
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+                )[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop(
+                        "negative_prompt_embeds", negative_prompt_embeds
+                    )
+                    add_text_embeds = callback_outputs.pop(
+                        "add_text_embeds", add_text_embeds
+                    )
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop(
+                        "negative_add_time_ids", negative_add_time_ids
+                    )
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = (
+                self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            )
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(
+                    next(iter(self.vae.post_quant_conv.parameters())).dtype
+                )
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = (
+                hasattr(self.vae.config, "latents_mean")
+                and self.vae.config.latents_mean is not None
+            )
+            has_latents_std = (
+                hasattr(self.vae.config, "latents_std")
+                and self.vae.config.latents_std is not None
+            )
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean)
+                    .view(1, 4, 1, 1)
+                    .to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std)
+                    .view(1, 4, 1, 1)
+                    .to(latents.device, latents.dtype)
+                )
+                latents = (
+                    latents * latents_std / self.vae.config.scaling_factor
+                    + latents_mean
+                )
+            else:
+                latents = latents / self.vae.config.scaling_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return StableDiffusionXLPipelineOutput(images=image)
+    ### NEW: adapters ###
+    def _init_custom_adapter(
+        self,
+        # Multi-view adapter
+        num_views: int,
+        self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
+        # Condition encoder
+        cond_in_channels: int = 6,
+        # For training
+        copy_attn_weights: bool = True,
+        zero_init_module_keys: List[str] = [],
+    ):
+        # Condition encoder
+        self.cond_encoder = T2IAdapter(
+            in_channels=cond_in_channels,
+            channels=(320, 640, 1280, 1280),
+            num_res_blocks=2,
+            downscale_factor=16,
+            adapter_type="full_adapter_xl",
+        )
+        # set custom attn processor for multi-view attention and image cross-attention
+        self.unet: UNet2DConditionModel
+        set_unet_2d_condition_attn_processor(
+            self.unet,
+            set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
+                query_dim=hs,
+                inner_dim=hs,
+                num_views=num_views,
+                name=name,
+                use_mv=True,
+                use_ref=True,
+            ),
+        )
+        # copy decoupled attention weights from original unet
+        if copy_attn_weights:
+            state_dict = self.unet.state_dict()
+            for key in state_dict.keys():
+                if "_mv" in key:
+                    compatible_key = key.replace("_mv", "").replace("processor.", "")
+                elif "_ref" in key:
+                    compatible_key = key.replace("_ref", "").replace("processor.", "")
+                else:
+                    compatible_key = key
+                is_zero_init_key = any([k in key for k in zero_init_module_keys])
+                if is_zero_init_key:
+                    state_dict[key] = torch.zeros_like(state_dict[compatible_key])
+                else:
+                    state_dict[key] = state_dict[compatible_key].clone()
+            self.unet.load_state_dict(state_dict)
+    def _load_custom_adapter(self, state_dict):
+        self.unet.load_state_dict(state_dict, strict=False)
+        self.cond_encoder.load_state_dict(state_dict, strict=False)
+    def _save_custom_adapter(
+        self,
+        include_keys: Optional[List[str]] = None,
+        exclude_keys: Optional[List[str]] = None,
+    ):
+        def include_fn(k):
+            is_included = False
+            if include_keys is not None:
+                is_included = is_included or any([key in k for key in include_keys])
+            if exclude_keys is not None:
+                is_included = is_included and not any(
+                    [key in k for key in exclude_keys]
+                )
+            return is_included
+        state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
+        state_dict.update(self.cond_encoder.state_dict())
+        return state_dict

mvadapter/pipelines/pipeline_mvadapter_t2mv_sdxl.py ADDED Viewed

	@@ -0,0 +1,792 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.models import AutoencoderKL, T2IAdapter, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
+    StableDiffusionXLPipelineOutput,
+)
+from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
+    StableDiffusionXLPipeline,
+    rescale_noise_cfg,
+    retrieve_timesteps,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import deprecate, logging
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+from ..loaders import CustomAdapterMixin
+from ..models.attention_processor import (
+    DecoupledMVRowSelfAttnProcessor2_0,
+    set_unet_2d_condition_attn_processor,
+)
+# Module-level logger, created through the `logging` helper imported from `diffusers.utils`.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class MVAdapterT2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
+    ):
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+            force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
+            add_watermarker=add_watermarker,
+        )
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_convert_rgb=True,
+            do_normalize=False,
+        )
+    def prepare_control_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+    ):
+        assert hasattr(
+            self, "control_image_processor"
+        ), "control_image_processor is not initialized"
+        image = self.control_image_processor.preprocess(
+            image, height=height, width=width
+        ).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt  # always 1 for control image
+        image = image.repeat_interleave(repeat_by, dim=0)
+        image = image.to(device=device, dtype=dtype)
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+        return image
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        denoising_end: Optional[float] = None,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt_2: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        original_size: Optional[Tuple[int, int]] = None,
+        crops_coords_top_left: Tuple[int, int] = (0, 0),
+        target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        # NEW
+        mv_scale: float = 1.0,
+        # Camera or geometry condition
+        control_image: Optional[PipelineImageInput] = None,
+        control_conditioning_scale: Optional[float] = 1.0,
+        control_conditioning_factor: float = 1.0,
+        # Optional. controlnet
+        controlnet_image: Optional[PipelineImageInput] = None,
+        controlnet_conditioning_scale: Optional[float] = 1.0,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                Anything below 512 pixels won't work well for
+                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                and checkpoints that are not specifically fine-tuned on low resolutions.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            guidance_rescale (`float`, *optional*, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+            )
+        # 0. Default height and width to unet
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+        original_size = original_size or (height, width)
+        target_size = target_size or (height, width)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            negative_prompt_2,
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._denoising_end = denoising_end
+        self._interrupt = False
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            negative_prompt_2=negative_prompt_2,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps
+        )
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat(
+                [negative_pooled_prompt_embeds, add_text_embeds], dim=0
+            )
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(
+            batch_size * num_images_per_prompt, 1
+        )
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+        # Preprocess control image
+        control_image_feature = self.prepare_control_image(
+            image=control_image,
+            width=width,
+            height=height,
+            batch_size=batch_size * num_images_per_prompt,
+            num_images_per_prompt=1,  # NOTE: always 1 for control images
+            device=device,
+            dtype=latents.dtype,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+        )
+        control_image_feature = control_image_feature.to(
+            device=device, dtype=latents.dtype
+        )
+        adapter_state = self.cond_encoder(control_image_feature)
+        for i, state in enumerate(adapter_state):
+            adapter_state[i] = state * control_conditioning_scale
+        # Preprocess controlnet image if provided
+        do_controlnet = controlnet_image is not None and hasattr(self, "controlnet")
+        if do_controlnet:
+            controlnet_image = self.prepare_control_image(
+                image=controlnet_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=1,  # NOTE: always 1 for control images
+                device=device,
+                dtype=latents.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+            controlnet_image = controlnet_image.to(device=device, dtype=latents.dtype)
+        # 8. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+        # 8.1 Apply denoising_end
+        if (
+            self.denoising_end is not None
+            and isinstance(self.denoising_end, float)
+            and self.denoising_end > 0
+            and self.denoising_end < 1
+        ):
+            discrete_timestep_cutoff = int(
+                round(
+                    self.scheduler.config.num_train_timesteps
+                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                )
+            )
+            num_inference_steps = len(
+                list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
+            )
+            timesteps = timesteps[:num_inference_steps]
+        # 9. Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
+                batch_size * num_images_per_prompt
+            )
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2)
+                    if self.do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                added_cond_kwargs = {
+                    "text_embeds": add_text_embeds,
+                    "time_ids": add_time_ids,
+                }
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    added_cond_kwargs["image_embeds"] = image_embeds
+                if i < int(num_inference_steps * control_conditioning_factor):
+                    down_intrablock_additional_residuals = [
+                        state.clone() for state in adapter_state
+                    ]
+                else:
+                    down_intrablock_additional_residuals = None
+                unet_add_kwargs = {}
+                # Do controlnet if provided
+                if do_controlnet:
+                    down_block_res_samples, mid_block_res_sample = self.controlnet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        controlnet_cond=controlnet_image,
+                        conditioning_scale=controlnet_conditioning_scale,
+                        guess_mode=False,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=False,
+                    )
+                    unet_add_kwargs.update(
+                        {
+                            "down_block_additional_residuals": down_block_res_samples,
+                            "mid_block_additional_residual": mid_block_res_sample,
+                        }
+                    )
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs={
+                        "mv_scale": mv_scale,
+                        **(self.cross_attention_kwargs or {}),
+                    },
+                    down_intrablock_additional_residuals=down_intrablock_additional_residuals,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                    **unet_add_kwargs,
+                )[0]
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(
+                        noise_pred,
+                        noise_pred_text,
+                        guidance_rescale=self.guidance_rescale,
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+                )[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop(
+                        "negative_prompt_embeds", negative_prompt_embeds
+                    )
+                    add_text_embeds = callback_outputs.pop(
+                        "add_text_embeds", add_text_embeds
+                    )
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop(
+                        "negative_add_time_ids", negative_add_time_ids
+                    )
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+        if not output_type == "latent":
+            # make sure the VAE is in float32 mode, as it overflows in float16
+            needs_upcasting = (
+                self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+            )
+            if needs_upcasting:
+                self.upcast_vae()
+                latents = latents.to(
+                    next(iter(self.vae.post_quant_conv.parameters())).dtype
+                )
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = (
+                hasattr(self.vae.config, "latents_mean")
+                and self.vae.config.latents_mean is not None
+            )
+            has_latents_std = (
+                hasattr(self.vae.config, "latents_std")
+                and self.vae.config.latents_std is not None
+            )
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean)
+                    .view(1, 4, 1, 1)
+                    .to(latents.device, latents.dtype)
+                )
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std)
+                    .view(1, 4, 1, 1)
+                    .to(latents.device, latents.dtype)
+                )
+                latents = (
+                    latents * latents_std / self.vae.config.scaling_factor
+                    + latents_mean
+                )
+            else:
+                latents = latents / self.vae.config.scaling_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            # cast back to fp16 if needed
+            if needs_upcasting:
+                self.vae.to(dtype=torch.float16)
+        else:
+            image = latents
+        if not output_type == "latent":
+            # apply watermark if available
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return StableDiffusionXLPipelineOutput(images=image)
+    ### NEW: adapters ###
+    def _init_custom_adapter(
+        self,
+        # Multi-view adapter
+        num_views: int,
+        self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
+        # Condition encoder
+        cond_in_channels: int = 6,
+        # For training
+        copy_attn_weights: bool = True,
+        zero_init_module_keys: List[str] = [],
+    ):
+        # Condition encoder
+        self.cond_encoder = T2IAdapter(
+            in_channels=cond_in_channels,
+            channels=(320, 640, 1280, 1280),
+            num_res_blocks=2,
+            downscale_factor=16,
+            adapter_type="full_adapter_xl",
+        )
+        # set custom attn processor for multi-view attention
+        self.unet: UNet2DConditionModel
+        set_unet_2d_condition_attn_processor(
+            self.unet,
+            set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
+                query_dim=hs,
+                inner_dim=hs,
+                num_views=num_views,
+                name=name,
+                use_mv=True,
+                use_ref=False,
+            ),
+        )
+        # copy decoupled attention weights from original unet
+        if copy_attn_weights:
+            state_dict = self.unet.state_dict()
+            for key in state_dict.keys():
+                if "_mv" in key:
+                    compatible_key = key.replace("_mv", "").replace("processor.", "")
+                else:
+                    compatible_key = key
+                is_zero_init_key = any([k in key for k in zero_init_module_keys])
+                if is_zero_init_key:
+                    state_dict[key] = torch.zeros_like(state_dict[compatible_key])
+                else:
+                    state_dict[key] = state_dict[compatible_key].clone()
+            self.unet.load_state_dict(state_dict)
+    def _load_custom_adapter(self, state_dict):
+        self.unet.load_state_dict(state_dict, strict=False)
+        self.cond_encoder.load_state_dict(state_dict, strict=False)
+    def _save_custom_adapter(
+        self,
+        include_keys: Optional[List[str]] = None,
+        exclude_keys: Optional[List[str]] = None,
+    ):
+        def include_fn(k):
+            is_included = False
+            if include_keys is not None:
+                is_included = is_included or any([key in k for key in include_keys])
+            if exclude_keys is not None:
+                is_included = is_included and not any(
+                    [key in k for key in exclude_keys]
+                )
+            return is_included
+        state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
+        state_dict.update(self.cond_encoder.state_dict())
+        return state_dict

mvadapter/schedulers/scheduler_utils.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import torch
+def get_sigmas(noise_scheduler, timesteps, n_dim=4, dtype=torch.float32, device=None):
+    sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
+    schedule_timesteps = noise_scheduler.timesteps.to(device)
+    timesteps = timesteps.to(device)
+    step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+    sigma = sigmas[step_indices].flatten()
+    while len(sigma.shape) < n_dim:
+        sigma = sigma.unsqueeze(-1)
+    return sigma
+def SNR_to_betas(snr):
+    """
+    Converts SNR to betas
+    """
+    # alphas_cumprod = pass
+    # snr = (alpha / ) ** 2
+    # alpha_t^2 / (1 - alpha_t^2) = snr
+    alpha_t = (snr / (1 + snr)) ** 0.5
+    alphas_cumprod = alpha_t**2
+    alphas = alphas_cumprod / torch.cat(
+        [torch.ones(1, device=snr.device), alphas_cumprod[:-1]]
+    )
+    betas = 1 - alphas
+    return betas
+def compute_snr(timesteps, noise_scheduler):
+    """
+    Computes SNR as per Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
+    """
+    alphas_cumprod = noise_scheduler.alphas_cumprod
+    sqrt_alphas_cumprod = alphas_cumprod**0.5
+    sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+    # Expand the tensors.
+    # Adapted from Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
+    sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
+        timesteps
+    ].float()
+    while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+    alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+    sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
+        device=timesteps.device
+    )[timesteps].float()
+    while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+    sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+    # Compute SNR.
+    snr = (alpha / sigma) ** 2
+    return snr
+def compute_alpha(timesteps, noise_scheduler):
+    alphas_cumprod = noise_scheduler.alphas_cumprod
+    sqrt_alphas_cumprod = alphas_cumprod**0.5
+    sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
+        timesteps
+    ].float()
+    while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+    alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+    return alpha

mvadapter/schedulers/scheduling_shift_snr.py ADDED Viewed

	@@ -0,0 +1,138 @@

+from typing import Any
+import torch
+from .scheduler_utils import SNR_to_betas, compute_snr
+class ShiftSNRScheduler:
+    def __init__(
+        self,
+        noise_scheduler: Any,
+        timesteps: Any,
+        shift_scale: float,
+        scheduler_class: Any,
+    ):
+        self.noise_scheduler = noise_scheduler
+        self.timesteps = timesteps
+        self.shift_scale = shift_scale
+        self.scheduler_class = scheduler_class
+    def _get_shift_scheduler(self):
+        """
+        Prepare scheduler for shifted betas.
+        :return: A scheduler object configured with shifted betas
+        """
+        snr = compute_snr(self.timesteps, self.noise_scheduler)
+        shifted_betas = SNR_to_betas(snr / self.shift_scale)
+        return self.scheduler_class.from_config(
+            self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
+        )
+    def _get_interpolated_shift_scheduler(self):
+        """
+        Prepare scheduler for shifted betas and interpolate with the original betas in log space.
+        :return: A scheduler object configured with interpolated shifted betas
+        """
+        snr = compute_snr(self.timesteps, self.noise_scheduler)
+        shifted_snr = snr / self.shift_scale
+        weighting = self.timesteps.float() / (
+            self.noise_scheduler.config.num_train_timesteps - 1
+        )
+        interpolated_snr = torch.exp(
+            torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting
+        )
+        shifted_betas = SNR_to_betas(interpolated_snr)
+        return self.scheduler_class.from_config(
+            self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
+        )
+    @classmethod
+    def from_scheduler(
+        cls,
+        noise_scheduler: Any,
+        shift_mode: str = "default",
+        timesteps: Any = None,
+        shift_scale: float = 1.0,
+        scheduler_class: Any = None,
+    ):
+        # Check input
+        if timesteps is None:
+            timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps)
+        if scheduler_class is None:
+            scheduler_class = noise_scheduler.__class__
+        # Create scheduler
+        shift_scheduler = cls(
+            noise_scheduler=noise_scheduler,
+            timesteps=timesteps,
+            shift_scale=shift_scale,
+            scheduler_class=scheduler_class,
+        )
+        if shift_mode == "default":
+            return shift_scheduler._get_shift_scheduler()
+        elif shift_mode == "interpolated":
+            return shift_scheduler._get_interpolated_shift_scheduler()
+        else:
+            raise ValueError(f"Unknown shift_mode: {shift_mode}")
+if __name__ == "__main__":
+    """
+    Compare the alpha values for different noise schedulers.
+    """
+    import matplotlib.pyplot as plt
+    from diffusers import DDPMScheduler
+    from .scheduler_utils import compute_alpha
+    # Base
+    timesteps = torch.arange(0, 1000)
+    noise_scheduler_base = DDPMScheduler.from_pretrained(
+        "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+    )
+    alpha = compute_alpha(timesteps, noise_scheduler_base)
+    plt.plot(timesteps.numpy(), alpha.numpy(), label="Base")
+    # Kolors
+    num_train_timesteps_ = 1100
+    timesteps_ = torch.arange(0, num_train_timesteps_)
+    noise_kwargs = {"beta_end": 0.014, "num_train_timesteps": num_train_timesteps_}
+    noise_scheduler_kolors = DDPMScheduler.from_config(
+        noise_scheduler_base.config, **noise_kwargs
+    )
+    alpha = compute_alpha(timesteps_, noise_scheduler_kolors)
+    plt.plot(timesteps_.numpy(), alpha.numpy(), label="Kolors")
+    # Shift betas
+    shift_scale = 8.0
+    noise_scheduler_shift = ShiftSNRScheduler.from_scheduler(
+        noise_scheduler_base, shift_mode="default", shift_scale=shift_scale
+    )
+    alpha = compute_alpha(timesteps, noise_scheduler_shift)
+    plt.plot(timesteps.numpy(), alpha.numpy(), label="Shift Noise (scale 8.0)")
+    # Shift betas (interpolated)
+    noise_scheduler_inter = ShiftSNRScheduler.from_scheduler(
+        noise_scheduler_base, shift_mode="interpolated", shift_scale=shift_scale
+    )
+    alpha = compute_alpha(timesteps, noise_scheduler_inter)
+    plt.plot(timesteps.numpy(), alpha.numpy(), label="Interpolated (scale 8.0)")
+    # ZeroSNR
+    noise_scheduler = DDPMScheduler.from_config(
+        noise_scheduler_base.config, rescale_betas_zero_snr=True
+    )
+    alpha = compute_alpha(timesteps, noise_scheduler)
+    plt.plot(timesteps.numpy(), alpha.numpy(), label="ZeroSNR")
+    plt.legend()
+    plt.grid()
+    plt.savefig("check_alpha.png")

mvadapter/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .camera import get_camera, get_orthogonal_camera
+from .geometry import get_plucker_embeds_from_cameras_ortho
+from .saving import make_image_grid, tensor_to_image

mvadapter/utils/camera.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import math
+from dataclasses import dataclass
+from typing import List, Optional, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+# import trimesh
+from PIL import Image
+from torch import BoolTensor, FloatTensor
+LIST_TYPE = Union[list, np.ndarray, torch.Tensor]
+def list_to_pt(
+    x: LIST_TYPE, dtype: Optional[torch.dtype] = None, device: Optional[str] = None
+) -> torch.Tensor:
+    if isinstance(x, list) or isinstance(x, np.ndarray):
+        return torch.tensor(x, dtype=dtype, device=device)
+    return x.to(dtype=dtype)
+def get_c2w(
+    elevation_deg: LIST_TYPE,
+    distance: LIST_TYPE,
+    azimuth_deg: Optional[LIST_TYPE],
+    num_views: Optional[int] = 1,
+    device: Optional[str] = None,
+) -> torch.FloatTensor:
+    if azimuth_deg is None:
+        assert (
+            num_views is not None
+        ), "num_views must be provided if azimuth_deg is None."
+        azimuth_deg = torch.linspace(
+            0, 360, num_views + 1, dtype=torch.float32, device=device
+        )[:-1]
+    else:
+        num_views = len(azimuth_deg)
+    azimuth_deg = list_to_pt(azimuth_deg, dtype=torch.float32, device=device)
+    elevation_deg = list_to_pt(elevation_deg, dtype=torch.float32, device=device)
+    camera_distances = list_to_pt(distance, dtype=torch.float32, device=device)
+    elevation = elevation_deg * math.pi / 180
+    azimuth = azimuth_deg * math.pi / 180
+    camera_positions = torch.stack(
+        [
+            camera_distances * torch.cos(elevation) * torch.cos(azimuth),
+            camera_distances * torch.cos(elevation) * torch.sin(azimuth),
+            camera_distances * torch.sin(elevation),
+        ],
+        dim=-1,
+    )
+    center = torch.zeros_like(camera_positions)
+    up = torch.tensor([0, 0, 1], dtype=torch.float32, device=device)[None, :].repeat(
+        num_views, 1
+    )
+    lookat = F.normalize(center - camera_positions, dim=-1)
+    right = F.normalize(torch.cross(lookat, up, dim=-1), dim=-1)
+    up = F.normalize(torch.cross(right, lookat, dim=-1), dim=-1)
+    c2w3x4 = torch.cat(
+        [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]],
+        dim=-1,
+    )
+    c2w = torch.cat([c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1)
+    c2w[:, 3, 3] = 1.0
+    return c2w
+def get_projection_matrix(
+    fovy_deg: LIST_TYPE,
+    aspect_wh: float = 1.0,
+    near: float = 0.1,
+    far: float = 100.0,
+    device: Optional[str] = None,
+) -> torch.FloatTensor:
+    fovy_deg = list_to_pt(fovy_deg, dtype=torch.float32, device=device)
+    batch_size = fovy_deg.shape[0]
+    fovy = fovy_deg * math.pi / 180
+    tan_half_fovy = torch.tan(fovy / 2)
+    projection_matrix = torch.zeros(
+        batch_size, 4, 4, dtype=torch.float32, device=device
+    )
+    projection_matrix[:, 0, 0] = 1 / (aspect_wh * tan_half_fovy)
+    projection_matrix[:, 1, 1] = -1 / tan_half_fovy
+    projection_matrix[:, 2, 2] = -(far + near) / (far - near)
+    projection_matrix[:, 2, 3] = -2 * far * near / (far - near)
+    projection_matrix[:, 3, 2] = -1
+    return projection_matrix
+def get_orthogonal_projection_matrix(
+    batch_size: int,
+    left: float,
+    right: float,
+    bottom: float,
+    top: float,
+    near: float = 0.1,
+    far: float = 100.0,
+    device: Optional[str] = None,
+) -> torch.FloatTensor:
+    projection_matrix = torch.zeros(
+        batch_size, 4, 4, dtype=torch.float32, device=device
+    )
+    projection_matrix[:, 0, 0] = 2 / (right - left)
+    projection_matrix[:, 1, 1] = -2 / (top - bottom)
+    projection_matrix[:, 2, 2] = -2 / (far - near)
+    projection_matrix[:, 0, 3] = -(right + left) / (right - left)
+    projection_matrix[:, 1, 3] = -(top + bottom) / (top - bottom)
+    projection_matrix[:, 2, 3] = -(far + near) / (far - near)
+    projection_matrix[:, 3, 3] = 1
+    return projection_matrix
+@dataclass
+class Camera:
+    c2w: Optional[torch.FloatTensor]
+    w2c: torch.FloatTensor
+    proj_mtx: torch.FloatTensor
+    mvp_mtx: torch.FloatTensor
+    cam_pos: Optional[torch.FloatTensor]
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            sl = slice(index, index + 1)
+        elif isinstance(index, slice):
+            sl = index
+        else:
+            raise NotImplementedError
+        return Camera(
+            c2w=self.c2w[sl] if self.c2w is not None else None,
+            w2c=self.w2c[sl],
+            proj_mtx=self.proj_mtx[sl],
+            mvp_mtx=self.mvp_mtx[sl],
+            cam_pos=self.cam_pos[sl] if self.cam_pos is not None else None,
+        )
+    def to(self, device: Optional[str] = None):
+        if self.c2w is not None:
+            self.c2w = self.c2w.to(device)
+        self.w2c = self.w2c.to(device)
+        self.proj_mtx = self.proj_mtx.to(device)
+        self.mvp_mtx = self.mvp_mtx.to(device)
+        if self.cam_pos is not None:
+            self.cam_pos = self.cam_pos.to(device)
+    def __len__(self):
+        return self.c2w.shape[0]
+def get_camera(
+    elevation_deg: Optional[LIST_TYPE] = None,
+    distance: Optional[LIST_TYPE] = None,
+    fovy_deg: Optional[LIST_TYPE] = None,
+    azimuth_deg: Optional[LIST_TYPE] = None,
+    num_views: Optional[int] = 1,
+    c2w: Optional[torch.FloatTensor] = None,
+    w2c: Optional[torch.FloatTensor] = None,
+    proj_mtx: Optional[torch.FloatTensor] = None,
+    aspect_wh: float = 1.0,
+    near: float = 0.1,
+    far: float = 100.0,
+    device: Optional[str] = None,
+):
+    if w2c is None:
+        if c2w is None:
+            c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device)
+        camera_positions = c2w[:, :3, 3]
+        w2c = torch.linalg.inv(c2w)
+    else:
+        camera_positions = None
+        c2w = None
+    if proj_mtx is None:
+        proj_mtx = get_projection_matrix(
+            fovy_deg, aspect_wh=aspect_wh, near=near, far=far, device=device
+        )
+    mvp_mtx = proj_mtx @ w2c
+    return Camera(
+        c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions
+    )
+def get_orthogonal_camera(
+    elevation_deg: LIST_TYPE,
+    distance: LIST_TYPE,
+    left: float,
+    right: float,
+    bottom: float,
+    top: float,
+    azimuth_deg: Optional[LIST_TYPE] = None,
+    num_views: Optional[int] = 1,
+    near: float = 0.1,
+    far: float = 100.0,
+    device: Optional[str] = None,
+):
+    c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device)
+    camera_positions = c2w[:, :3, 3]
+    w2c = torch.linalg.inv(c2w)
+    proj_mtx = get_orthogonal_projection_matrix(
+        batch_size=c2w.shape[0],
+        left=left,
+        right=right,
+        bottom=bottom,
+        top=top,
+        near=near,
+        far=far,
+        device=device,
+    )
+    mvp_mtx = proj_mtx @ w2c
+    return Camera(
+        c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions
+    )

mvadapter/utils/geometry.py ADDED Viewed

	@@ -0,0 +1,253 @@

+from typing import List, Optional, Tuple
+import numpy as np
+import torch
+from torch.nn import functional as F
+def get_position_map_from_depth(depth, mask, intrinsics, extrinsics, image_wh=None):
+    """Compute the position map from the depth map and the camera parameters for a batch of views.
+    Args:
+        depth (torch.Tensor): The depth maps with the shape (B, H, W, 1).
+        mask (torch.Tensor): The masks with the shape (B, H, W, 1).
+        intrinsics (torch.Tensor): The camera intrinsics matrices with the shape (B, 3, 3).
+        extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4).
+        image_wh (Tuple[int, int]): The image width and height.
+    Returns:
+        torch.Tensor: The position maps with the shape (B, H, W, 3).
+    """
+    if image_wh is None:
+        image_wh = depth.shape[2], depth.shape[1]
+    B, H, W, _ = depth.shape
+    depth = depth.squeeze(-1)
+    u_coord, v_coord = torch.meshgrid(
+        torch.arange(image_wh[0]), torch.arange(image_wh[1]), indexing="xy"
+    )
+    u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
+    v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
+    # Compute the position map by back-projecting depth pixels to 3D space
+    x = (
+        (u_coord - intrinsics[:, 0, 2].unsqueeze(-1).unsqueeze(-1))
+        * depth
+        / intrinsics[:, 0, 0].unsqueeze(-1).unsqueeze(-1)
+    )
+    y = (
+        (v_coord - intrinsics[:, 1, 2].unsqueeze(-1).unsqueeze(-1))
+        * depth
+        / intrinsics[:, 1, 1].unsqueeze(-1).unsqueeze(-1)
+    )
+    z = depth
+    # Concatenate to form the 3D coordinates in the camera frame
+    camera_coords = torch.stack([x, y, z], dim=-1)
+    # Apply the extrinsic matrix to get coordinates in the world frame
+    coords_homogeneous = torch.nn.functional.pad(
+        camera_coords, (0, 1), "constant", 1.0
+    )  # Add a homogeneous coordinate
+    world_coords = torch.matmul(
+        coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2)
+    ).view(B, H, W, 4)
+    # Apply the mask to the position map
+    position_map = world_coords[..., :3] * mask
+    return position_map
+def get_position_map_from_depth_ortho(
+    depth, mask, extrinsics, ortho_scale, image_wh=None
+):
+    """Compute the position map from the depth map and the camera parameters for a batch of views
+    using orthographic projection with a given ortho_scale.
+    Args:
+        depth (torch.Tensor): The depth maps with the shape (B, H, W, 1).
+        mask (torch.Tensor): The masks with the shape (B, H, W, 1).
+        extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4).
+        ortho_scale (torch.Tensor): The scaling factor for the orthographic projection with the shape (B, 1, 1, 1).
+        image_wh (Tuple[int, int]): Optional. The image width and height.
+    Returns:
+        torch.Tensor: The position maps with the shape (B, H, W, 3).
+    """
+    if image_wh is None:
+        image_wh = depth.shape[2], depth.shape[1]
+    B, H, W, _ = depth.shape
+    depth = depth.squeeze(-1)
+    # Generating grid of coordinates in the image space
+    u_coord, v_coord = torch.meshgrid(
+        torch.arange(0, image_wh[0]), torch.arange(0, image_wh[1]), indexing="xy"
+    )
+    u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
+    v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
+    # Compute the position map using orthographic projection with ortho_scale
+    x = (u_coord - image_wh[0] / 2) / ortho_scale / image_wh[0]
+    y = (v_coord - image_wh[1] / 2) / ortho_scale / image_wh[1]
+    z = depth
+    # Concatenate to form the 3D coordinates in the camera frame
+    camera_coords = torch.stack([x, y, z], dim=-1)
+    # Apply the extrinsic matrix to get coordinates in the world frame
+    coords_homogeneous = torch.nn.functional.pad(
+        camera_coords, (0, 1), "constant", 1.0
+    )  # Add a homogeneous coordinate
+    world_coords = torch.matmul(
+        coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2)
+    ).view(B, H, W, 4)
+    # Apply the mask to the position map
+    position_map = world_coords[..., :3] * mask
+    return position_map
+def get_opencv_from_blender(matrix_world, fov=None, image_size=None):
+    # convert matrix_world to opencv format extrinsics
+    opencv_world_to_cam = matrix_world.inverse()
+    opencv_world_to_cam[1, :] *= -1
+    opencv_world_to_cam[2, :] *= -1
+    R, T = opencv_world_to_cam[:3, :3], opencv_world_to_cam[:3, 3]
+    if fov is None:  # orthographic camera
+        return R, T
+    R, T = R.unsqueeze(0), T.unsqueeze(0)
+    # convert fov to opencv format intrinsics
+    focal = 1 / np.tan(fov / 2)
+    intrinsics = np.diag(np.array([focal, focal, 1])).astype(np.float32)
+    opencv_cam_matrix = (
+        torch.from_numpy(intrinsics).unsqueeze(0).float().to(matrix_world.device)
+    )
+    opencv_cam_matrix[:, :2, -1] += torch.tensor([image_size / 2, image_size / 2]).to(
+        matrix_world.device
+    )
+    opencv_cam_matrix[:, [0, 1], [0, 1]] *= image_size / 2
+    return R, T, opencv_cam_matrix
+def get_ray_directions(
+    H: int,
+    W: int,
+    focal: float,
+    principal: Optional[Tuple[float, float]] = None,
+    use_pixel_centers: bool = True,
+) -> torch.Tensor:
+    """
+    Get ray directions for all pixels in camera coordinate.
+    Args:
+        H, W, focal, principal, use_pixel_centers: image height, width, focal length, principal point and whether use pixel centers
+    Outputs:
+        directions: (H, W, 3), the direction of the rays in camera coordinate
+    """
+    pixel_center = 0.5 if use_pixel_centers else 0
+    cx, cy = W / 2, H / 2 if principal is None else principal
+    i, j = torch.meshgrid(
+        torch.arange(W, dtype=torch.float32) + pixel_center,
+        torch.arange(H, dtype=torch.float32) + pixel_center,
+        indexing="xy",
+    )
+    directions = torch.stack(
+        [(i - cx) / focal, -(j - cy) / focal, -torch.ones_like(i)], -1
+    )
+    return F.normalize(directions, dim=-1)
+def get_rays(
+    directions: torch.Tensor, c2w: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Get ray origins and directions from camera coordinates to world coordinates
+    Args:
+        directions: (H, W, 3) ray directions in camera coordinates
+        c2w: (4, 4) camera-to-world transformation matrix
+    Outputs:
+        rays_o, rays_d: (H, W, 3) ray origins and directions in world coordinates
+    """
+    # Rotate ray directions from camera coordinate to the world coordinate
+    rays_d = directions @ c2w[:3, :3].T
+    rays_o = c2w[:3, 3].expand(rays_d.shape)
+    return rays_o, rays_d
+def compute_plucker_embed(
+    c2w: torch.Tensor, image_width: int, image_height: int, focal: float
+) -> torch.Tensor:
+    """
+    Computes Plucker coordinates for a camera.
+    Args:
+        c2w: (4, 4) camera-to-world transformation matrix
+        image_width: Image width
+        image_height: Image height
+        focal: Focal length of the camera
+    Returns:
+        plucker: (6, H, W) Plucker embedding
+    """
+    directions = get_ray_directions(image_height, image_width, focal)
+    rays_o, rays_d = get_rays(directions, c2w)
+    # Cross product to get Plucker coordinates
+    cross = torch.cross(rays_o, rays_d, dim=-1)
+    plucker = torch.cat((rays_d, cross), dim=-1)
+    return plucker.permute(2, 0, 1)
+def get_plucker_embeds_from_cameras(
+    c2w: List[torch.Tensor], fov: List[float], image_size: int
+) -> torch.Tensor:
+    """
+    Given lists of camera transformations and fov, returns the batched plucker embeddings.
+    Args:
+        c2w: list of camera-to-world transformation matrices
+        fov: list of field of view values
+        image_size: size of the image
+    Returns:
+        plucker_embeds: (B, 6, H, W) batched plucker embeddings
+    """
+    plucker_embeds = []
+    for cam_matrix, cam_fov in zip(c2w, fov):
+        focal = 0.5 * image_size / np.tan(0.5 * cam_fov)
+        plucker = compute_plucker_embed(cam_matrix, image_size, image_size, focal)
+        plucker_embeds.append(plucker)
+    return torch.stack(plucker_embeds)
+def get_plucker_embeds_from_cameras_ortho(
+    c2w: List[torch.Tensor], ortho_scale: List[float], image_size: int
+):
+    """
+    Given lists of camera transformations and fov, returns the batched plucker embeddings.
+    Parameters:
+        c2w: list of camera-to-world transformation matrices
+        fov: list of field of view values
+        image_size: size of the image
+    Returns:
+        plucker_embeds: plucker embeddings (B, 6, H, W)
+    """
+    plucker_embeds = []
+    # compute pairwise mask and plucker embeddings
+    for cam_matrix, scale in zip(c2w, ortho_scale):
+        # blender to opencv to pytorch3d
+        R, T = get_opencv_from_blender(cam_matrix)
+        cam_pos = -R.T @ T
+        view_dir = R.T @ torch.tensor([0, 0, 1]).float().to(cam_matrix.device)
+        # normalize camera position
+        cam_pos = F.normalize(cam_pos, dim=0)
+        plucker = torch.concat([view_dir, cam_pos])
+        plucker = plucker.unsqueeze(-1).unsqueeze(-1).repeat(1, image_size, image_size)
+        plucker_embeds.append(plucker)
+    plucker_embeds = torch.stack(plucker_embeds)
+    return plucker_embeds

mvadapter/utils/saving.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import math
+from typing import List, Optional, Union
+import numpy as np
+import torch
+from PIL import Image
+def tensor_to_image(
+    data: Union[Image.Image, torch.Tensor, np.ndarray],
+    batched: bool = False,
+    format: str = "HWC",
+) -> Union[Image.Image, List[Image.Image]]:
+    if isinstance(data, Image.Image):
+        return data
+    if isinstance(data, torch.Tensor):
+        data = data.detach().cpu().numpy()
+    if data.dtype == np.float32 or data.dtype == np.float16:
+        data = (data * 255).astype(np.uint8)
+    elif data.dtype == np.bool_:
+        data = data.astype(np.uint8) * 255
+    assert data.dtype == np.uint8
+    if format == "CHW":
+        if batched and data.ndim == 4:
+            data = data.transpose((0, 2, 3, 1))
+        elif not batched and data.ndim == 3:
+            data = data.transpose((1, 2, 0))
+    if batched:
+        return [Image.fromarray(d) for d in data]
+    return Image.fromarray(data)
+def largest_factor_near_sqrt(n: int) -> int:
+    """
+    Finds the largest factor of n that is closest to the square root of n.
+    Args:
+        n (int): The integer for which to find the largest factor near its square root.
+    Returns:
+        int: The largest factor of n that is closest to the square root of n.
+    """
+    sqrt_n = int(math.sqrt(n))  # Get the integer part of the square root
+    # First, check if the square root itself is a factor
+    if sqrt_n * sqrt_n == n:
+        return sqrt_n
+    # Otherwise, find the largest factor by iterating from sqrt_n downwards
+    for i in range(sqrt_n, 0, -1):
+        if n % i == 0:
+            return i
+    # If n is 1, return 1
+    return 1
+def make_image_grid(
+    images: List[Image.Image],
+    rows: Optional[int] = None,
+    cols: Optional[int] = None,
+    resize: Optional[int] = None,
+) -> Image.Image:
+    """
+    Prepares a single grid of images. Useful for visualization purposes.
+    """
+    if rows is None and cols is not None:
+        assert len(images) % cols == 0
+        rows = len(images) // cols
+    elif cols is None and rows is not None:
+        assert len(images) % rows == 0
+        cols = len(images) // rows
+    elif rows is None and cols is None:
+        rows = largest_factor_near_sqrt(len(images))
+        cols = len(images) // rows
+    assert len(images) == rows * cols
+    if resize is not None:
+        images = [img.resize((resize, resize)) for img in images]
+    w, h = images[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    for i, img in enumerate(images):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid