Koke_Cacao committed
Commit 5b08d3b · 1 Parent(s): 57e6edd

:sparkles: finish inference
.gitignore CHANGED
@@ -2,3 +2,4 @@
 *.yaml
 converted
 __pycache__
+*.png
scripts/README.md CHANGED
@@ -14,6 +14,5 @@ wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd
 
 Hugging Face diffusers weights are converted by script:
 ```bash
-mkdir converted
 python ./scripts/convert_mvdream_to_diffusers.py --checkpoint_path ./sd-v1.5-4view.pt --dump_path ./converted --original_config_file ./sd-v1.yaml
 ```
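Note: a minimal usage sketch for the converted weights (not part of this commit). It assumes the conversion above wrote `./converted` and that the `scripts/` modules (`pipeline_mvdream.py`, `models.py`) are importable; whether `from_pretrained` resolves the custom classes cleanly depends on how they are registered, so treat this as illustrative only.

```python
# Hypothetical loading/inference sketch; mirrors the __main__ block added in
# convert_mvdream_to_diffusers.py below.
from pipeline_mvdream import MVDreamStableDiffusionPipeline

# Assumes the custom pipeline/unet classes are importable when loading.
pipe = MVDreamStableDiffusionPipeline.from_pretrained("./converted")
pipe = pipe.to("cuda")

images = pipe(
    prompt="Head of Hatsune Miku",
    negative_prompt="painting, bad quality, flat",
    return_dict=False,  # return the list of PIL images directly
    guidance_scale=7.5,
    num_inference_steps=50,
)
for i, image in enumerate(images):
    image.save(f"image_{i}.png")
```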
scripts/convert_mvdream_to_diffusers.py CHANGED
@@ -20,7 +20,7 @@ from diffusers.utils import logging
 from accelerate import init_empty_weights
 from accelerate.utils import set_module_tensor_to_device
 from rich import print, print_json
-from models import MultiViewUNetModel
+from models import MultiViewUNetModel, MultiViewUNetWrapperModel
 from pipeline_mvdream import MVDreamStableDiffusionPipeline
 from transformers import CLIPTokenizer, CLIPTextModel
 
@@ -659,7 +659,6 @@ def conv_attn_to_linear(checkpoint):
             if checkpoint[key].ndim > 2:
                 checkpoint[key] = checkpoint[key][:, :, 0]
 
-
 def convert_from_original_mvdream_ckpt(
     checkpoint_path,
     original_config_file,
@@ -667,13 +666,13 @@ def convert_from_original_mvdream_ckpt(
     device
 ):
     checkpoint = torch.load(checkpoint_path, map_location=device)
-    print(f"Checkpoint: {checkpoint.keys()}")
+    # print(f"Checkpoint: {checkpoint.keys()}")
     torch.cuda.empty_cache()
 
     from omegaconf import OmegaConf
 
     original_config = OmegaConf.load(original_config_file)
-    print(f"Original Config: {original_config}")
+    # print(f"Original Config: {original_config}")
     prediction_type = "epsilon"
     image_size = 256
     num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
@@ -700,10 +699,11 @@ def convert_from_original_mvdream_ckpt(
     # converted_unet_checkpoint = convert_ldm_unet_checkpoint(
     #     checkpoint, unet_config, path=None, extract_ema=extract_ema
     # )
-    print(f"Unet Config: {original_config.model.params.unet_config.params}")
-    unet: MultiViewUNetModel = MultiViewUNetModel(**original_config.model.params.unet_config.params)
+    # print(f"Unet Config: {original_config.model.params.unet_config.params}")
+    unet: MultiViewUNetWrapperModel = MultiViewUNetWrapperModel(**original_config.model.params.unet_config.params)
+    # print(f"Unet State Dict: {unet.state_dict().keys()}")
     unet.load_state_dict({
-        key.replace("model.diffusion_model.", ""): value for key, value in checkpoint.items() if key.replace("model.diffusion_model.", "") in unet.state_dict()
+        key.replace("model.diffusion_model.", "unet."): value for key, value in checkpoint.items() if key.replace("model.diffusion_model.", "unet.") in unet.state_dict()
     })
     for param_name, param in unet.state_dict().items():
         set_module_tensor_to_device(unet, param_name, "cuda:0", value=param)
@@ -738,9 +738,6 @@ def convert_from_original_mvdream_ckpt(
         tokenizer=tokenizer,
         text_encoder=text_encoder,
         scheduler=scheduler,
-        safety_checker=None,
-        feature_extractor=None,
-        requires_safety_checker=False
     )
 
     return pipe
@@ -787,8 +784,15 @@ if __name__ == "__main__":
     if args.half:
         pipe.to(torch_dtype=torch.float16)
 
-    out = pipe()
+    images = pipe(
+        prompt="Head of Hatsune Miku",
+        negative_prompt="painting, bad quality, flat",
+        output_type="pil",
+        return_dict=False,
+        guidance_scale=7.5,
+        num_inference_steps=50,
+    )
+    for i, image in enumerate(images):
+        image.save(f"image_{i}.png")
 
-    assert False
-
     pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
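Note on the state-dict remapping above: because the UNet now lives under a `unet.` attribute of `MultiViewUNetWrapperModel`, the original checkpoint keys (`model.diffusion_model.*`) have to be renamed to `unet.*` before `load_state_dict`, and keys the wrapper does not own (VAE, text encoder, EMA copies) are filtered out. A small equivalent helper, with a hypothetical name, just to make the inline dict comprehension explicit:

```python
# Equivalent of the inline dict comprehension in convert_from_original_mvdream_ckpt;
# the helper name is illustrative, not part of the script.
def remap_unet_keys(checkpoint: dict, unet_state_dict: dict) -> dict:
    remapped = {}
    for key, value in checkpoint.items():
        new_key = key.replace("model.diffusion_model.", "unet.")
        if new_key in unet_state_dict:  # drop keys the wrapper does not know about
            remapped[new_key] = value
    return remapped
```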
scripts/models.py CHANGED
@@ -1,5 +1,6 @@
 from abc import abstractmethod
 import math
+from typing import Any, Mapping
 
 import numpy as np
 import torch as th
@@ -18,6 +19,16 @@ from util import (
 from attention import SpatialTransformer, SpatialTransformer3D, exists
 
 
+from diffusers.configuration_utils import ConfigMixin
+from diffusers.models.modeling_utils import ModelMixin
+class MultiViewUNetWrapperModel(ModelMixin, ConfigMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.unet: MultiViewUNetModel = MultiViewUNetModel(*args, **kwargs)
+
+    def forward(self, *args, **kwargs):
+        return self.unet(*args, **kwargs)
+
 # dummy replace
 def convert_module_to_f16(x):
     pass
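Note: the wrapper's only job is to make the raw `MultiViewUNetModel` look like a diffusers model, so `register_modules` and `save_pretrained` can treat it like any other component while `forward` simply delegates. A toy sketch of the same pattern (not MVDream code); diffusers normally records constructor kwargs via `register_to_config`, which the wrapper above skips because it is rebuilt from the YAML config during conversion:

```python
# Toy illustration of the ModelMixin/ConfigMixin wrapper pattern used above.
import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin


class ToyWrapperModel(ModelMixin, ConfigMixin):

    @register_to_config  # records hidden_dim in config.json so from_pretrained can rebuild it
    def __init__(self, hidden_dim: int = 8):
        super().__init__()
        self.net = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        return self.net(x)


wrapper = ToyWrapperModel(hidden_dim=8)
print(wrapper(torch.randn(1, 8)).shape)  # torch.Size([1, 8])
```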
scripts/pipeline_mvdream.py CHANGED
@@ -2,11 +2,10 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
-from packaging import version
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPTextModel, CLIPTokenizer
 
-from diffusers import AutoencoderKL, UNet2DConditionModel, DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
+from diffusers import AutoencoderKL, DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import (
     deprecate,
     is_accelerate_available,
@@ -16,26 +15,21 @@ from diffusers.utils import (
 )
 
 try:
-    from diffusers import randn_tensor # old import
+    from diffusers import randn_tensor  # old import
 except ImportError:
-    from diffusers.utils.torch_utils import randn_tensor # new import
+    from diffusers.utils.torch_utils import randn_tensor  # new import
 
 from diffusers.configuration_utils import FrozenDict
-import PIL
 import numpy as np
-import kornia
-from diffusers.configuration_utils import ConfigMixin
-from diffusers.models.modeling_utils import ModelMixin
-
-from models import MultiViewUNetModel
 from diffusers.schedulers import DDIMScheduler
+from models import MultiViewUNetModel, MultiViewUNetWrapperModel
 
 EXAMPLE_DOC_STRING = ""
 
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 import numpy as np
+
 def create_camera_to_world_matrix(elevation, azimuth):
     elevation = np.radians(elevation)
     azimuth = np.radians(azimuth)
@@ -43,12 +37,12 @@ def create_camera_to_world_matrix(elevation, azimuth):
     x = np.cos(elevation) * np.sin(azimuth)
     y = np.sin(elevation)
     z = np.cos(elevation) * np.cos(azimuth)
-
+
     # Calculate camera position, target, and up vectors
     camera_pos = np.array([x, y, z])
     target = np.array([0, 0, 0])
     up = np.array([0, 1, 0])
-
+
     # Construct view matrix
     forward = target - camera_pos
     forward /= np.linalg.norm(forward)
@@ -61,90 +55,96 @@ def create_camera_to_world_matrix(elevation, azimuth):
     cam2world[:3, 3] = camera_pos
     return cam2world
 
+
 def convert_opengl_to_blender(camera_matrix):
     if isinstance(camera_matrix, np.ndarray):
         # Construct transformation matrix to convert from OpenGL space to Blender space
-        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0],
+                            [0, 0, 0, 1]])
         camera_matrix_blender = np.dot(flip_yz, camera_matrix)
     else:
         # Construct transformation matrix to convert from OpenGL space to Blender space
-        flip_yz = torch.tensor([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+        flip_yz = torch.tensor([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0],
+                                [0, 0, 0, 1]])
         if camera_matrix.ndim == 3:
             flip_yz = flip_yz.unsqueeze(0)
-        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
+        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix),
+                                             camera_matrix)
     return camera_matrix_blender
 
-def get_camera(num_frames, elevation=15, azimuth_start=0, azimuth_span=360, blender_coord=True):
+
+def get_camera(num_frames,
+               elevation=15,
+               azimuth_start=0,
+               azimuth_span=360,
+               blender_coord=True):
     angle_gap = azimuth_span / num_frames
     cameras = []
-    for azimuth in np.arange(azimuth_start, azimuth_span+azimuth_start, angle_gap):
+    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start,
+                             angle_gap):
         camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
         if blender_coord:
             camera_matrix = convert_opengl_to_blender(camera_matrix)
         cameras.append(camera_matrix.flatten())
     return torch.tensor(np.stack(cameras, 0)).float()
 
+
 class MVDreamStableDiffusionPipeline(DiffusionPipeline):
+
     def __init__(
         self,
         vae: AutoencoderKL,
-        unet: MultiViewUNetModel,
+        unet: MultiViewUNetWrapperModel,
         tokenizer: CLIPTokenizer,
         text_encoder: CLIPTextModel,
         scheduler: DDIMScheduler,
-        safety_checker: Optional[StableDiffusionSafetyChecker],
-        feature_extractor: Optional[CLIPFeatureExtractor],
-        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                                   f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                                   "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                                   " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                                   " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                                   " file")
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+        if hasattr(scheduler.config,
+                   "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file")
+            deprecate("steps_offset!=1",
+                      "1.0.0",
+                      deprecation_message,
+                      standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                                   " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                                   " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                                   " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                                   " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file")
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+        if hasattr(scheduler.config,
+                   "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set",
+                      "1.0.0",
+                      deprecation_message,
+                      standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["clip_sample"] = False
            scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                           " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                           " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                           " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                           " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                           " information, please have a look at https://github.com/huggingface/diffusers/pull/254 .")
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError("Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                             " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead.")
-
         self.register_modules(
             vae=vae,
             unet=unet,
             scheduler=scheduler,
             tokenizer=tokenizer,
             text_encoder=text_encoder,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)
-        # self.model_mode = None
+        self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) -
+                                    1)
+        self.register_to_config(requires_safety_checker=False)
 
     def enable_vae_slicing(self):
         r"""
@@ -189,20 +189,20 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
         else:
-            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+            raise ImportError(
+                "`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher"
+            )
 
         device = torch.device(f"cuda:{gpu_id}")
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+            torch.cuda.empty_cache(
+            )  # otherwise we don't see the memory savings (but they probably exist)
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             cpu_offload(cpu_offloaded_model, device)
 
-        if self.safety_checker is not None:
-            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
     def enable_model_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -210,23 +210,26 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+        if is_accelerate_available() and is_accelerate_version(
+                ">=", "0.17.0.dev0"):
             from accelerate import cpu_offload_with_hook
         else:
-            raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.")
+            raise ImportError(
+                "`enable_model_offload` requires `accelerate v0.17.0` or higher."
+            )
 
         device = torch.device(f"cuda:{gpu_id}")
 
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
+            torch.cuda.empty_cache(
+            )  # otherwise we don't see the memory savings (but they probably exist)
 
         hook = None
         for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model,
+                                            device,
+                                            prev_module_hook=hook)
 
         # We'll offload the last model manually.
         self.final_offload_hook = hook
@@ -241,7 +244,9 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         if not hasattr(self.unet, "_hf_hook"):
             return self.device
         for module in self.unet.modules():
-            if (hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device") and module._hf_hook.execution_device is not None):
+            if (hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
                    and module._hf_hook.execution_device is not None):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
@@ -295,14 +300,21 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
-        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
-            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
-            logger.warning("The following part of your input was truncated because CLIP can only handle sequences up to"
-                           f" {self.tokenizer.model_max_length} tokens: {removed_text}")
-
-        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+        untruncated_ids = self.tokenizer(prompt,
+                                         padding="longest",
+                                         return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[
+                -1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(
+                untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask"
+                   ) and self.text_encoder.config.use_attention_mask:
             attention_mask = text_inputs.attention_mask.to(device)
         else:
             attention_mask = None
@@ -313,12 +325,14 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         )
         prompt_embeds = prompt_embeds[0]
 
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype,
+                                         device=device)
 
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt,
+                                           seq_len, -1)
 
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
@@ -326,14 +340,16 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             if negative_prompt is None:
                 uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
-                raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                                f" {type(prompt)}.")
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}.")
             elif isinstance(negative_prompt, str):
                 uncond_tokens = [negative_prompt]
             elif batch_size != len(negative_prompt):
-                raise ValueError(f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                                 f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                                 " the batch size of `prompt`.")
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`.")
             else:
                 uncond_tokens = negative_prompt
 
@@ -346,7 +362,8 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
                 return_tensors="pt",
             )
 
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            if hasattr(self.text_encoder.config, "use_attention_mask"
+                       ) and self.text_encoder.config.use_attention_mask:
                 attention_mask = uncond_input.attention_mask.to(device)
             else:
                 attention_mask = None
@@ -361,10 +378,13 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
 
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=self.text_encoder.dtype, device=device)
 
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
+                1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(
+                batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
@@ -373,14 +393,6 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
 
         return prompt_embeds
 
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values.to(dtype))
-        else:
-            has_nsfw_concept = None
-        return image, has_nsfw_concept
-
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
@@ -395,25 +407,42 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
 
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys())
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
         # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys())
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+    def prepare_latents(self,
+                        batch_size,
+                        num_channels_latents,
+                        height,
+                        width,
+                        dtype,
+                        device,
+                        generator,
+                        latents=None):
+        shape = (batch_size, num_channels_latents,
+                 height // self.vae_scale_factor,
+                 width // self.vae_scale_factor)
         if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                             f" size of {batch_size}. Make sure the batch size matches the length of the generators.")
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
 
         if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = randn_tensor(shape,
+                                   generator=generator,
+                                   device=device,
+                                   dtype=dtype)
         else:
             latents = latents.to(device)
 
@@ -433,10 +462,12 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         negative_prompt: str = "bad quality",
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        generator: Optional[Union[torch.Generator,
+                                  List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.FloatTensor],
+                                    None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         controlnet_conditioning_scale: float = 1.0,
@@ -514,9 +545,9 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         # 0. Default height and width to unet
         batch_size = 4
         device = torch.device("cuda:0")
-
-        camera = get_camera(4).to(device=device)
-
+
+        camera = get_camera(batch_size).to(device=device)
+
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
@@ -525,14 +556,15 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
-
-        prompt_embeds: torch.Tensor = self._encode_prompt(
+
+        _: torch.Tensor = self._encode_prompt(
             prompt=prompt,
             device=device,
             num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=True,
+            do_classifier_free_guidance=do_classifier_free_guidance,
             negative_prompt=negative_prompt,
-        ) # type: ignore
+        )  # type: ignore
+        prompt_embeds_neg, prompt_embeds_pos = _.chunk(2)
 
         # 5. Prepare latent variables
         latents: torch.Tensor = self.prepare_latents(
@@ -540,44 +572,65 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             4,
             height,
             width,
-            prompt_embeds.dtype,
+            prompt_embeds_pos.dtype,
             device,
             generator,
             None,
         )
-
+
         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 7. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        num_warmup_steps = len(
+            timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                multiplier = 2 if do_classifier_free_guidance else 1
+                latent_model_input = torch.cat([latents] * multiplier)
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t)
 
                 # predict the noise residual
-                prompt_embeds = torch.cat([prompt_embeds] * 4)
-                print(f"shape of latent_model_input: {latent_model_input.shape}") # [2*4, 4, 32, 32]
-                print(f"shape of prompt_embeds: {prompt_embeds.shape}") # [2*4, 77, 768]
-                print(f"shape of camera: {camera.shape}") # [4, 16]
-                noise_pred = self.unet.forward(x=latent_model_input, timesteps=torch.tensor([t], device=device), context=prompt_embeds, num_frames=4)
+                # print(
+                #     f"shape of latent_model_input: {latent_model_input.shape}"
+                # )  # [2*4, 4, 32, 32]
+                # print(f"shape of prompt_embeds: {prompt_embeds.shape}"
+                #       )  # [2*4, 77, 768]
+                # print(f"shape of camera: {camera.shape}")  # [4, 16]
+                noise_pred = self.unet.forward(
+                    x=latent_model_input,
+                    timesteps=torch.tensor([t] * 4 * multiplier,
+                                           device=device),
+                    context=torch.cat([prompt_embeds_neg] * 4 +
+                                      [prompt_embeds_pos] * 4),
+                    num_frames=4,
+                    camera=torch.cat([camera] * multiplier),
+                )
 
                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond)
 
                 # compute the previous noisy sample x_t -> x_t-1
                 # latents = self.scheduler.step(noise_pred.to(dtype=torch.float32), t, latents.to(dtype=torch.float32)).prev_sample.to(prompt_embeds.dtype)
-                latents: torch.Tensor = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                latents: torch.Tensor = self.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    **extra_step_kwargs,
+                    return_dict=False)[0]
 
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                if i == len(timesteps) - 1 or (
+                        (i + 1) > num_warmup_steps and
+                        (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents) # type: ignore
+                        callback(i, t, latents)  # type: ignore
 
         # 8. Post-processing
         if output_type == "latent":
@@ -592,10 +645,13 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             image = self.decode_latents(latents)
 
         # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+        if hasattr(
+                self,
+                "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
 
         if not return_dict:
-            return (image, None)
+            return image
 
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
+        return StableDiffusionPipelineOutput(images=image,
+                                             nsfw_content_detected=None)
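Note on the denoising loop above: with classifier-free guidance the per-step batch stacks an unconditional half and a conditional half, each carrying the same four views and the same four cameras, and the per-prompt embedding is repeated once per view. A shape sketch with dummy tensors (illustrative sizes taken from the commented-out prints):

```python
# Dummy tensors only; shapes match the comments in __call__
# ([2*4, 4, 32, 32], [2*4, 77, 768], [4, 16]).
import torch

num_frames, multiplier = 4, 2                   # 4 views; uncond + cond halves
latents = torch.randn(num_frames, 4, 32, 32)
prompt_embeds_neg = torch.randn(1, 77, 768)
prompt_embeds_pos = torch.randn(1, 77, 768)
camera = torch.randn(num_frames, 16)

latent_model_input = torch.cat([latents] * multiplier)     # [8, 4, 32, 32]
context = torch.cat([prompt_embeds_neg] * num_frames +
                    [prompt_embeds_pos] * num_frames)       # [8, 77, 768]
camera_input = torch.cat([camera] * multiplier)             # [8, 16]
print(latent_model_input.shape, context.shape, camera_input.shape)
```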