flamehaze1115 committed on
Commit
b582ef0
1 Parent(s): 9729d73

first commit

Files changed (46)
  1. .gitignore +189 -0
  2. README.md +69 -13
  3. app.py +331 -0
  4. configs/mvdiffusion-joint-ortho-6views.yaml +42 -0
  5. example_images/14_10_29_489_Tiger_1__1.png +0 -0
  6. example_images/box.png +0 -0
  7. example_images/bread.png +0 -0
  8. example_images/cat.png +0 -0
  9. example_images/cat_head.png +0 -0
  10. example_images/chili.png +0 -0
  11. example_images/duola.png +0 -0
  12. example_images/halloween.png +0 -0
  13. example_images/head.png +0 -0
  14. example_images/kettle.png +0 -0
  15. example_images/kunkun.png +0 -0
  16. example_images/milk.png +0 -0
  17. example_images/owl.png +0 -0
  18. example_images/poro.png +0 -0
  19. example_images/pumpkin.png +0 -0
  20. example_images/skull.png +0 -0
  21. example_images/stone.png +0 -0
  22. example_images/teapot.png +0 -0
  23. example_images/tiger-head-3d-model-obj-stl.png +0 -0
  24. mvdiffusion/data/fixed_poses/four_views/000_back_RT.txt +3 -0
  25. mvdiffusion/data/fixed_poses/four_views/000_front_RT.txt +3 -0
  26. mvdiffusion/data/fixed_poses/four_views/000_left_RT.txt +3 -0
  27. mvdiffusion/data/fixed_poses/four_views/000_right_RT.txt +3 -0
  28. mvdiffusion/data/fixed_poses/nine_views/000_back_RT.txt +3 -0
  29. mvdiffusion/data/fixed_poses/nine_views/000_back_left_RT.txt +3 -0
  30. mvdiffusion/data/fixed_poses/nine_views/000_back_right_RT.txt +3 -0
  31. mvdiffusion/data/fixed_poses/nine_views/000_front_RT.txt +3 -0
  32. mvdiffusion/data/fixed_poses/nine_views/000_front_left_RT.txt +3 -0
  33. mvdiffusion/data/fixed_poses/nine_views/000_front_right_RT.txt +3 -0
  34. mvdiffusion/data/fixed_poses/nine_views/000_left_RT.txt +3 -0
  35. mvdiffusion/data/fixed_poses/nine_views/000_right_RT.txt +3 -0
  36. mvdiffusion/data/fixed_poses/nine_views/000_top_RT.txt +3 -0
  37. mvdiffusion/data/normal_utils.py +45 -0
  38. mvdiffusion/data/objaverse_dataset.py +608 -0
  39. mvdiffusion/data/single_image_dataset.py +321 -0
  40. mvdiffusion/models/transformer_mv2d.py +1005 -0
  41. mvdiffusion/models/unet_mv2d_blocks.py +880 -0
  42. mvdiffusion/models/unet_mv2d_condition.py +1462 -0
  43. mvdiffusion/pipelines/pipeline_mvdiffusion_image.py +485 -0
  44. requirements.txt +30 -0
  45. run_test.sh +1 -0
  46. utils/misc.py +54 -0
.gitignore ADDED
@@ -0,0 +1,189 @@
1
+ # Initially taken from GitHub's Python gitignore file
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # tests and logs
12
+ tests/fixtures/cached_*_text.txt
13
+ logs/
14
+ lightning_logs/
15
+ lang_code_data/
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # celery beat schedule file
92
+ celerybeat-schedule
93
+
94
+ # SageMath parsed files
95
+ *.sage.py
96
+
97
+ # Environments
98
+ .env
99
+ .venv
100
+ env/
101
+ venv/
102
+ ENV/
103
+ env.bak/
104
+ venv.bak/
105
+
106
+ # Spyder project settings
107
+ .spyderproject
108
+ .spyproject
109
+
110
+ # Rope project settings
111
+ .ropeproject
112
+
113
+ # mkdocs documentation
114
+ /site
115
+
116
+ # mypy
117
+ .mypy_cache/
118
+ .dmypy.json
119
+ dmypy.json
120
+
121
+ # Pyre type checker
122
+ .pyre/
123
+
124
+ # vscode
125
+ .vs
126
+ .vscode
127
+
128
+ # Pycharm
129
+ .idea
130
+
131
+ # TF code
132
+ tensorflow_code
133
+
134
+ # Models
135
+ proc_data
136
+
137
+ # examples
138
+ runs
139
+ /runs_old
140
+ /wandb
141
+ /examples/runs
142
+ /examples/**/*.args
143
+ /examples/rag/sweep
144
+
145
+ # data
146
+ /data
147
+ serialization_dir
148
+
149
+ # emacs
150
+ *.*~
151
+ debug.env
152
+
153
+ # vim
154
+ .*.swp
155
+
156
+ #ctags
157
+ tags
158
+
159
+ # pre-commit
160
+ .pre-commit*
161
+
162
+ # .lock
163
+ *.lock
164
+
165
+ # DS_Store (MacOS)
166
+ .DS_Store
167
+ # RL pipelines may produce mp4 outputs
168
+ *.mp4
169
+
170
+ # dependencies
171
+ /transformers
172
+
173
+ # ruff
174
+ .ruff_cache
175
+
176
+ # ckpts
177
+ *.ckpt
178
+
179
+ outputs/*
180
+
181
+ NeuS/exp/*
182
+ NeuS/test_scenes/*
183
+ NeuS/mesh2tex/*
184
+ neus_configs
185
+ vast/*
186
+ render_results
187
+ experiments/*
188
+ neus/*
189
+ ckpts/*
README.md CHANGED
@@ -1,13 +1,69 @@
1
- ---
2
- title: Wonder3D
3
- emoji: 👁
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 3.50.2
8
- app_file: app.py
9
- pinned: false
10
- license: cc-by-sa-4.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Wonder3D
2
+ Single Image to 3D using Cross-Domain Diffusion
3
+ ## [Paper](https://arxiv.org/abs/2310.15008) | [Project page](https://www.xxlong.site/Wonder3D/)
4
+
5
+ ![](assets/fig_teaser.png)
6
+
7
+ Wonder3D reconstructs highly detailed textured meshes from a single-view image in only 2 to 3 minutes. Wonder3D first generates consistent multi-view normal maps with corresponding color images via a cross-domain diffusion model, and then leverages a novel normal fusion method to achieve fast and high-quality reconstruction.
8
+
9
+ ## Schedule
10
+ - [x] Inference code and pretrained models.
11
+ - [ ] Huggingface demo.
12
+ - [ ] Training code.
13
+ - [ ] Rendering code for data preparation.
14
+
15
+
16
+ ### Preparation for inference
17
+ 1. Install packages in `requirements.txt`.
18
+ ```bash
19
+ conda create -n wonder3d
20
+ conda activate wonder3d
21
+ pip install -r requirements.txt
22
+ ```
23
+ 2. Download the [checkpoints](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/xxlong_connect_hku_hk/EgSHPyJAtaJFpV_BjXM3zXwB-UMIrT4v-sQwGgw-coPtIA) into the root folder.
24
+
25
+ ### Inference
26
+ 1. Make sure you have the following models.
27
+ ```bash
28
+ Wonder3D
29
+ |-- ckpts
30
+ |-- unet
31
+ |-- scheduler.bin
32
+ ...
33
+ ```
34
+ 2. Predict the foreground mask as the alpha channel. We use [Clipdrop](https://clipdrop.co/remove-background) to segment the foreground object interactively.
+ You may also use `rembg` to remove the background:
36
+ ```python
+ # !pip install rembg
+ import rembg
+ from PIL import Image
+
+ image = Image.open("example_images/owl.png")
+ result = rembg.remove(image)
+ result.show()
+ ```
42
+ 3. Run Wonder3D to produce multiview-consistent normal maps and color images. You can then check the results in the folder `./outputs`. (We use `rembg` to remove the backgrounds of the results, but the segmentations are not always perfect.)
43
+ ```bash
44
+ accelerate launch --config_file 1gpu.yaml test_mvdiffusion_seq.py \
45
+ --config mvdiffusion-joint-ortho-6views.yaml
46
+ ```
47
+ or
48
+ ```bash
49
+ bash run_test.sh
50
+ ```
51
+
52
+ 4. Mesh Extraction
53
+ ```bash
54
+ cd ./instant-nsr-pl
55
+ bash run.sh output_folder_path scene_name
56
+ ```
57
+
58
+ ## Citation
59
+ If you find this repository useful in your project, please cite the following work. :)
60
+ ```
61
+ @misc{long2023wonder3d,
62
+ title={Wonder3D: Single Image to 3D using Cross-Domain Diffusion},
63
+ author={Xiaoxiao Long and Yuan-Chen Guo and Cheng Lin and Yuan Liu and Zhiyang Dou and Lingjie Liu and Yuexin Ma and Song-Hai Zhang and Marc Habermann and Christian Theobalt and Wenping Wang},
64
+ year={2023},
65
+ eprint={2310.15008},
66
+ archivePrefix={arXiv},
67
+ primaryClass={cs.CV}
68
+ }
69
+ ```
app.py ADDED
@@ -0,0 +1,331 @@
1
+ import os
2
+ import sys
3
+ import numpy
4
+ import torch
5
+ import rembg
6
+ import threading
7
+ import urllib.request
8
+ from PIL import Image
9
+ from typing import Dict, Optional, Tuple, List
10
+ from dataclasses import dataclass
11
+ import streamlit as st
12
+ import huggingface_hub
13
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
14
+ from mvdiffusion.models.unet_mv2d_condition import UNetMV2DConditionModel
15
+ from mvdiffusion.data.single_image_dataset import SingleImageDataset as MVDiffusionDataset
16
+ from mvdiffusion.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline
17
+ from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
18
+
19
+ @dataclass
20
+ class TestConfig:
21
+ pretrained_model_name_or_path: str
22
+ pretrained_unet_path:str
23
+ revision: Optional[str]
24
+ validation_dataset: Dict
25
+ save_dir: str
26
+ seed: Optional[int]
27
+ validation_batch_size: int
28
+ dataloader_num_workers: int
29
+
30
+ local_rank: int
31
+
32
+ pipe_kwargs: Dict
33
+ pipe_validation_kwargs: Dict
34
+ unet_from_pretrained_kwargs: Dict
35
+ validation_guidance_scales: List[float]
36
+ validation_grid_nrow: int
37
+ camera_embedding_lr_mult: float
38
+
39
+ num_views: int
40
+ camera_embedding_type: str
41
+
42
+ pred_type: str # joint, or ablation
43
+
44
+ enable_xformers_memory_efficient_attention: bool
45
+
46
+ cond_on_normals: bool
47
+ cond_on_colors: bool
48
+
49
+ img_example_counter = 0
50
+ iret_base = 'example_images'
51
+ iret = [
52
+ dict(rimageinput=os.path.join(iret_base, x), dispi=os.path.join(iret_base, x))
53
+ for x in sorted(os.listdir(iret_base))
54
+ ]
55
+
56
+
57
+ class SAMAPI:
58
+ predictor = None
59
+
60
+ @staticmethod
61
+ @st.cache_resource
62
+ def get_instance(sam_checkpoint=None):
63
+ if SAMAPI.predictor is None:
64
+ if sam_checkpoint is None:
65
+ sam_checkpoint = "tmp/sam_vit_h_4b8939.pth"
66
+ if not os.path.exists(sam_checkpoint):
67
+ os.makedirs('tmp', exist_ok=True)
68
+ urllib.request.urlretrieve(
69
+ "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
70
+ sam_checkpoint
71
+ )
72
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
73
+ model_type = "default"
74
+
75
+ from segment_anything import sam_model_registry, SamPredictor
76
+
77
+ sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
78
+ sam.to(device=device)
79
+
80
+ predictor = SamPredictor(sam)
81
+ SAMAPI.predictor = predictor
82
+ return SAMAPI.predictor
83
+
84
+ @staticmethod
85
+ def segment_api(rgb, mask=None, bbox=None, sam_checkpoint=None):
86
+ """
87
+
88
+ Parameters
89
+ ----------
90
+ rgb : np.ndarray h,w,3 uint8
91
+ mask: np.ndarray h,w bool
92
+
93
+ Returns
94
+ -------
95
+
96
+ """
97
+ np = numpy
98
+ predictor = SAMAPI.get_instance(sam_checkpoint)
99
+ predictor.set_image(rgb)
100
+ if mask is None and bbox is None:
101
+ box_input = None
102
+ else:
103
+ # mask to bbox
104
+ if bbox is None:
105
+ y1, y2, x1, x2 = np.nonzero(mask)[0].min(), np.nonzero(mask)[0].max(), np.nonzero(mask)[1].min(), \
106
+ np.nonzero(mask)[1].max()
107
+ else:
108
+ x1, y1, x2, y2 = bbox
109
+ box_input = np.array([[x1, y1, x2, y2]])
110
+ masks, scores, logits = predictor.predict(
111
+ box=box_input,
112
+ multimask_output=True,
113
+ return_logits=False,
114
+ )
115
+ mask = masks[-1]
116
+ return mask
117
+
118
+
119
+ def image_examples(samples, ncols, return_key=None, example_text="Examples"):
120
+ global img_example_counter
121
+ trigger = False
122
+ with st.expander(example_text, True):
123
+ for i in range(len(samples) // ncols):
124
+ cols = st.columns(ncols)
125
+ for j in range(ncols):
126
+ idx = i * ncols + j
127
+ if idx >= len(samples):
128
+ continue
129
+ entry = samples[idx]
130
+ with cols[j]:
131
+ st.image(entry['dispi'])
132
+ img_example_counter += 1
133
+ with st.columns(5)[2]:
134
+ this_trigger = st.button('\+', key='imgexuse%d' % img_example_counter)
135
+ trigger = trigger or this_trigger
136
+ if this_trigger:
137
+ trigger = entry[return_key]
138
+ return trigger
139
+
140
+
141
+ def segment_img(img: Image):
142
+ output = rembg.remove(img)
143
+ mask = numpy.array(output)[:, :, 3] > 0
144
+ sam_mask = SAMAPI.segment_api(numpy.array(img)[:, :, :3], mask)
145
+ segmented_img = Image.new("RGBA", img.size, (0, 0, 0, 0))
146
+ segmented_img.paste(img, mask=Image.fromarray(sam_mask))
147
+ return segmented_img
148
+
149
+
150
+ def segment_6imgs(imgs):
151
+ segmented_imgs = []
152
+ for i, img in enumerate(imgs):
153
+ output = rembg.remove(img)
154
+ mask = numpy.array(output)[:, :, 3]
155
+ mask = SAMAPI.segment_api(numpy.array(img)[:, :, :3], mask)
156
+ data = numpy.array(img)[:,:,:3]
157
+ data[mask == 0] = [255, 255, 255]
158
+ segmented_imgs.append(data)
159
+ result = numpy.concatenate([
160
+ numpy.concatenate([segmented_imgs[0], segmented_imgs[1]], axis=1),
161
+ numpy.concatenate([segmented_imgs[2], segmented_imgs[3]], axis=1),
162
+ numpy.concatenate([segmented_imgs[4], segmented_imgs[5]], axis=1)
163
+ ])
164
+ return Image.fromarray(result)
165
+
166
+ def pack_6imgs(imgs):
167
+ result = numpy.concatenate([
168
+ numpy.concatenate([imgs[0], imgs[1]], axis=1),
169
+ numpy.concatenate([imgs[2], imgs[3]], axis=1),
170
+ numpy.concatenate([imgs[4], imgs[5]], axis=1)
171
+ ])
172
+ return Image.fromarray(result)
173
+
174
+
175
+ def expand2square(pil_img, background_color):
176
+ width, height = pil_img.size
177
+ if width == height:
178
+ return pil_img
179
+ elif width > height:
180
+ result = Image.new(pil_img.mode, (width, width), background_color)
181
+ result.paste(pil_img, (0, (width - height) // 2))
182
+ return result
183
+ else:
184
+ result = Image.new(pil_img.mode, (height, height), background_color)
185
+ result.paste(pil_img, ((height - width) // 2, 0))
186
+ return result
187
+
188
+
189
+ @st.cache_data
190
+ def check_dependencies():
191
+ reqs = []
192
+ try:
193
+ import diffusers
194
+ except ImportError:
195
+ import traceback
196
+ traceback.print_exc()
197
+ print("Error: `diffusers` not found.", file=sys.stderr)
198
+ reqs.append("diffusers==0.20.2")
199
+ else:
200
+ if not diffusers.__version__.startswith("0.20"):
201
+ print(
202
+ f"Warning: You are using an unsupported version of diffusers ({diffusers.__version__}), which may lead to performance issues.",
203
+ file=sys.stderr
204
+ )
205
+ print("Recommended version is `diffusers==0.20.2`.", file=sys.stderr)
206
+ try:
207
+ import transformers
208
+ except ImportError:
209
+ import traceback
210
+ traceback.print_exc()
211
+ print("Error: `transformers` not found.", file=sys.stderr)
212
+ reqs.append("transformers==4.29.2")
213
+ if torch.__version__ < '2.0':
214
+ try:
215
+ import xformers
216
+ except ImportError:
217
+ print("Warning: You are using PyTorch 1.x without a working `xformers` installation.", file=sys.stderr)
218
+ print("You may see a significant memory overhead when running the model.", file=sys.stderr)
219
+ if len(reqs):
220
+ print(f"Info: Fix all dependency errors with `pip install {' '.join(reqs)}`.")
221
+
222
+
223
+ @st.cache_resource
224
+ def load_wonder3d_pipeline(cfg):
225
+ # Load scheduler, tokenizer and models.
226
+ # noise_scheduler = DDPMScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
227
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", revision=cfg.revision)
228
+ feature_extractor = CLIPImageProcessor.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="feature_extractor", revision=cfg.revision)
229
+ vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", revision=cfg.revision)
230
+ unet = UNetMV2DConditionModel.from_pretrained_2d(cfg.pretrained_unet_path, subfolder="unet", revision=cfg.revision, **cfg.unet_from_pretrained_kwargs)
231
+
232
+ weight_dtype = torch.float16
233
+ # Move text_encode and vae to gpu and cast to weight_dtype
234
+ image_encoder.to(dtype=weight_dtype)
235
+ vae.to(dtype=weight_dtype)
236
+ unet.to(dtype=weight_dtype)
237
+
238
+ pipeline = MVDiffusionImagePipeline(
239
+ image_encoder=image_encoder, feature_extractor=feature_extractor, vae=vae, unet=unet, safety_checker=None,
240
+ scheduler=DDIMScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler"),
241
+ **cfg.pipe_kwargs
242
+ )
243
+
244
+ if torch.cuda.is_available():
245
+ pipeline.to('cuda:0')
246
+ sys.main_lock = threading.Lock()
247
+ return pipeline
248
+
249
+
250
+ from utils.misc import load_config
251
+ from omegaconf import OmegaConf
252
+ # parse YAML config to OmegaConf
253
+ cfg = load_config("./configs/mvdiffusion-joint-ortho-6views.yaml")
254
+ # print(cfg)
255
+ schema = OmegaConf.structured(TestConfig)
256
+ # cfg = OmegaConf.load(args.config)
257
+ cfg = OmegaConf.merge(schema, cfg)
258
+
259
+ check_dependencies()
260
+ pipeline = load_wonder3d_pipeline(cfg)
261
+ SAMAPI.get_instance()
262
+ torch.set_grad_enabled(False)
263
+
264
+ st.title("Wonder3D Demo")
265
+ # st.caption("For faster inference without waiting in queue, you may clone the space and run it yourself.")
266
+ prog = st.progress(0.0, "Idle")
267
+ pic = st.file_uploader("Upload an Image", key='imageinput', type=['png', 'jpg', 'webp'])
268
+ left, right = st.columns(2)
269
+ with left:
270
+ rem_input_bg = st.checkbox("Remove Input Background")
271
+ with right:
272
+ rem_output_bg = st.checkbox("Remove Output Background")
273
+ num_inference_steps = st.slider("Number of Inference Steps", 15, 100, 75)
274
+ st.caption("Diffusion Steps. For general real or synthetic objects, around 28 is enough. For objects with delicate details such as faces (either realistic or illustration), you may need 75 or more steps.")
275
+ cfg_scale = st.slider("Classifier Free Guidance Scale", 1.0, 10.0, 4.0)
276
+ seed = st.text_input("Seed", "42")
277
+ submit = False
278
+ if st.button("Submit"):
279
+ submit = True
280
+ results_container = st.container()
281
+ sample_got = image_examples(iret, 4, 'rimageinput')
282
+ if sample_got:
283
+ pic = sample_got
284
+ with results_container:
285
+ if sample_got or submit:
286
+ prog.progress(0.03, "Waiting in Queue...")
287
+ with sys.main_lock:
288
+ seed = int(seed)
289
+ torch.manual_seed(seed)
290
+ img = Image.open(pic)
291
+ if max(img.size) > 1280:
292
+ w, h = img.size
293
+ w = round(1280 / max(img.size) * w)
294
+ h = round(1280 / max(img.size) * h)
295
+ img = img.resize((w, h))
296
+ left, right = st.columns(2)
297
+ with left:
298
+ st.image(img)
299
+ st.caption("Input Image")
300
+ prog.progress(0.1, "Preparing Inputs")
301
+ if rem_input_bg:
302
+ with right:
303
+ img = segment_img(img)
304
+ st.image(img)
305
+ st.caption("Input (Background Removed)")
306
+ img = expand2square(img, (127, 127, 127, 0))
307
+ pipeline.set_progress_bar_config(disable=True)
308
+ result = pipeline(
309
+ img,
310
+ num_inference_steps=num_inference_steps,
311
+ guidance_scale=cfg_scale,
312
+ generator=torch.Generator(pipeline.device).manual_seed(seed),
313
+ callback=lambda i, t, latents: prog.progress(0.1 + 0.8 * i / num_inference_steps, "Diffusion Step %d" % i)
314
+ ).images
315
+ bsz = result.shape[0] // 2
316
+ normals_pred = result[:bsz]
317
+ images_pred = result[bsz:]
318
+ prog.progress(0.9, "Post Processing")
319
+ left, right = st.columns(2)
320
+ with left:
321
+ st.image(pack_6imgs(normals_pred))
322
+ st.image(pack_6imgs(images_pred))
323
+ st.caption("Result")
324
+ if rem_output_bg:
325
+ normals_pred = segment_6imgs(normals_pred)
326
+ images_pred = segment_6imgs(images_pred)
327
+ with right:
328
+ st.image(normals_pred)
329
+ st.image(images_pred)
330
+ st.caption("Result (Background Removed)")
331
+ prog.progress(1.0, "Idle")
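
For reference, a minimal sketch of the output convention used above: the joint pipeline stacks the two prediction domains along the batch axis, so with `num_views = 6` the result holds twelve images that are split in half. The sketch reuses the `pipeline` and `img` objects defined in the script above.

```python
# Sketch of the output split performed in app.py (assumes num_views = 6).
result = pipeline(img, num_inference_steps=75, guidance_scale=4.0).images  # 12 images stacked

bsz = result.shape[0] // 2        # 6
normals_pred = result[:bsz]       # first half: predicted normal maps, one per view
images_pred = result[bsz:]        # second half: predicted color images, one per view
```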
configs/mvdiffusion-joint-ortho-6views.yaml ADDED
@@ -0,0 +1,42 @@
1
+ pretrained_model_name_or_path: 'lambdalabs/sd-image-variations-diffusers'
2
+ pretrained_unet_path: './ckpts/'
3
+ revision: null
4
+ validation_dataset:
5
+ root_dir: "./example_images" # the folder path stores testing images
6
+ num_views: 6
7
+ bg_color: 'white'
8
+ img_wh: [256, 256]
9
+ num_validation_samples: 1000
10
+ crop_size: 192
11
+ filepaths: ['owl.png'] # the test image names. leave it empty, test all images in the folder
12
+
13
+ save_dir: 'outputs/'
14
+
15
+ pred_type: 'joint'
16
+ seed: 42
17
+ validation_batch_size: 1
18
+ dataloader_num_workers: 64
19
+
20
+ local_rank: -1
21
+
22
+ pipe_kwargs:
23
+ camera_embedding_type: 'e_de_da_sincos'
24
+ num_views: 6
25
+
26
+ validation_guidance_scales: [3.0]
27
+ pipe_validation_kwargs:
28
+ eta: 1.0
29
+ validation_grid_nrow: 6
30
+
31
+ unet_from_pretrained_kwargs:
32
+ camera_embedding_type: 'e_de_da_sincos'
33
+ projection_class_embeddings_input_dim: 10
34
+ num_views: 6
35
+ sample_size: 32
36
+ zero_init_conv_in: false
37
+ zero_init_camera_projection: false
38
+
39
+ num_views: 6
40
+ camera_embedding_type: 'e_de_da_sincos'
41
+
42
+ enable_xformers_memory_efficient_attention: true
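
For reference, a minimal sketch of reading this YAML with OmegaConf; `app.py` above does the same and additionally merges it onto the structured `TestConfig` schema so that missing or mistyped keys fail early.

```python
from omegaconf import OmegaConf

# Minimal sketch: parse the config and read a few of the fields defined above.
cfg = OmegaConf.load("./configs/mvdiffusion-joint-ortho-6views.yaml")
print(cfg.num_views)                          # 6
print(cfg.validation_dataset.img_wh)          # [256, 256]
print(cfg.pipe_kwargs.camera_embedding_type)  # e_de_da_sincos

# app.py goes one step further:
#   schema = OmegaConf.structured(TestConfig)
#   cfg = OmegaConf.merge(schema, cfg)
```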
example_images/14_10_29_489_Tiger_1__1.png ADDED
example_images/box.png ADDED
example_images/bread.png ADDED
example_images/cat.png ADDED
example_images/cat_head.png ADDED
example_images/chili.png ADDED
example_images/duola.png ADDED
example_images/halloween.png ADDED
example_images/head.png ADDED
example_images/kettle.png ADDED
example_images/kunkun.png ADDED
example_images/milk.png ADDED
example_images/owl.png ADDED
example_images/poro.png ADDED
example_images/pumpkin.png ADDED
example_images/skull.png ADDED
example_images/stone.png ADDED
example_images/teapot.png ADDED
example_images/tiger-head-3d-model-obj-stl.png ADDED
mvdiffusion/data/fixed_poses/four_views/000_back_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -1.000000238418579102e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
2
+ 0.000000000000000000e+00 -1.343588564850506373e-07 1.000000119209289551e+00 1.746665105883948854e-07
3
+ 0.000000000000000000e+00 1.000000119209289551e+00 -1.343588564850506373e-07 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/four_views/000_front_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 1.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
2
+ 0.000000000000000000e+00 -1.343588564850506373e-07 1.000000119209289551e+00 -1.746665105883948854e-07
3
+ 0.000000000000000000e+00 -1.000000119209289551e+00 -1.343588564850506373e-07 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/four_views/000_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -2.220446049250313081e-16 -1.000000000000000000e+00 0.000000000000000000e+00 -2.886579758146288598e-16
2
+ 0.000000000000000000e+00 -2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00
3
+ -1.000000000000000000e+00 0.000000000000000000e+00 -2.220446049250313081e-16 -1.299999952316284180e+00
mvdiffusion/data/fixed_poses/four_views/000_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00 2.886579758146288598e-16
2
+ 0.000000000000000000e+00 -2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00
3
+ 1.000000000000000000e+00 0.000000000000000000e+00 -2.220446049250313081e-16 -1.299999952316284180e+00
mvdiffusion/data/fixed_poses/nine_views/000_back_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -5.266582965850830078e-01 7.410295009613037109e-01 -4.165407419204711914e-01 -5.960464477539062500e-08
2
+ 5.865638996738198330e-08 4.900035560131072998e-01 8.717204332351684570e-01 -9.462351613365171943e-08
3
+ 8.500770330429077148e-01 4.590988159179687500e-01 -2.580644786357879639e-01 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/nine_views/000_back_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -9.734988808631896973e-01 1.993551850318908691e-01 -1.120596975088119507e-01 -1.713633537292480469e-07
2
+ 3.790224578636980368e-09 4.900034964084625244e-01 8.717204928398132324e-01 1.772203575001185527e-07
3
+ 2.286916375160217285e-01 8.486189246177673340e-01 -4.770178496837615967e-01 -1.838477611541748047e+00
mvdiffusion/data/fixed_poses/nine_views/000_back_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 2.286914736032485962e-01 8.486190438270568848e-01 -4.770178198814392090e-01 1.564621925354003906e-07
2
+ -3.417914484771245043e-08 4.900034070014953613e-01 8.717205524444580078e-01 -7.293811421504869941e-08
3
+ 9.734990000724792480e-01 -1.993550658226013184e-01 1.120596155524253845e-01 -1.838477969169616699e+00
mvdiffusion/data/fixed_poses/nine_views/000_front_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 5.266583561897277832e-01 -7.410295009613037109e-01 4.165407419204711914e-01 0.000000000000000000e+00
2
+ 5.865638996738198330e-08 4.900035560131072998e-01 8.717204332351684570e-01 9.462351613365171943e-08
3
+ -8.500770330429077148e-01 -4.590988159179687500e-01 2.580645382404327393e-01 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/nine_views/000_front_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -2.286916971206665039e-01 -8.486189842224121094e-01 4.770179092884063721e-01 -2.458691596984863281e-07
2
+ 9.085837859856837895e-09 4.900034666061401367e-01 8.717205524444580078e-01 1.205695667749751010e-07
3
+ -9.734990000724792480e-01 1.993551701307296753e-01 -1.120597645640373230e-01 -1.838477969169616699e+00
mvdiffusion/data/fixed_poses/nine_views/000_front_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 9.734989404678344727e-01 -1.993551850318908691e-01 1.120596975088119507e-01 -1.415610313415527344e-07
2
+ 3.790224578636980368e-09 4.900034964084625244e-01 8.717204928398132324e-01 -1.772203575001185527e-07
3
+ -2.286916375160217285e-01 -8.486189246177673340e-01 4.770178794860839844e-01 -1.838477611541748047e+00
mvdiffusion/data/fixed_poses/nine_views/000_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -8.500771522521972656e-01 -4.590989053249359131e-01 2.580644488334655762e-01 0.000000000000000000e+00
2
+ -4.257411134744870651e-08 4.900034964084625244e-01 8.717204928398132324e-01 9.006067358541258727e-08
3
+ -5.266583561897277832e-01 7.410295605659484863e-01 -4.165408313274383545e-01 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/nine_views/000_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 8.500770330429077148e-01 4.590989053249359131e-01 -2.580644488334655762e-01 5.960464477539062500e-08
2
+ -4.257411134744870651e-08 4.900034964084625244e-01 8.717204928398132324e-01 -9.006067358541258727e-08
3
+ 5.266583561897277832e-01 -7.410295605659484863e-01 4.165407419204711914e-01 -1.300000071525573730e+00
mvdiffusion/data/fixed_poses/nine_views/000_top_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 9.958608150482177734e-01 7.923202216625213623e-02 -4.453715682029724121e-02 -3.098167056236889039e-09
2
+ -9.089154005050659180e-02 8.681122064590454102e-01 -4.879753291606903076e-01 5.784738377201392723e-08
3
+ -2.028124157504862524e-08 4.900035560131072998e-01 8.717204332351684570e-01 -1.300000071525573730e+00
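
Each of these fixed-pose files stores a 3×4 world-to-camera matrix `[R | t]`; the dataset classes further below load them with `np.loadtxt`, and the camera center in world space can be recovered as `-Rᵀt` (cf. `ObjaverseDataset.get_T`). A minimal sketch:

```python
import numpy as np

# Load one of the fixed world-to-camera poses added in this commit.
RT = np.loadtxt("mvdiffusion/data/fixed_poses/nine_views/000_front_RT.txt")  # shape (3, 4)
R, t = RT[:3, :3], RT[:, -1]

cam_center = -R.T @ t                                # camera position in world coordinates
print(RT.shape)                                      # (3, 4)
print(round(float(np.linalg.norm(cam_center)), 2))   # ~1.3, the fixed camera distance
```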
mvdiffusion/data/normal_utils.py ADDED
@@ -0,0 +1,45 @@
1
+ import numpy as np
2
+
3
+ def camNormal2worldNormal(rot_c2w, camNormal):
4
+ H,W,_ = camNormal.shape
5
+ normal_img = np.matmul(rot_c2w[None, :, :], camNormal.reshape(-1,3)[:, :, None]).reshape([H, W, 3])
6
+
7
+ return normal_img
8
+
9
+ def worldNormal2camNormal(rot_w2c, normal_map_world):
10
+ H,W,_ = normal_map_world.shape
11
+ # normal_img = np.matmul(rot_w2c[None, :, :], worldNormal.reshape(-1,3)[:, :, None]).reshape([H, W, 3])
12
+
13
+ # faster version
14
+ # Reshape the normal map into a 2D array where each row represents a normal vector
15
+ normal_map_flat = normal_map_world.reshape(-1, 3)
16
+
17
+ # Transform the normal vectors using the transformation matrix
18
+ normal_map_camera_flat = np.dot(normal_map_flat, rot_w2c.T)
19
+
20
+ # Reshape the transformed normal map back to its original shape
21
+ normal_map_camera = normal_map_camera_flat.reshape(normal_map_world.shape)
22
+
23
+ return normal_map_camera
24
+
25
+ def trans_normal(normal, RT_w2c, RT_w2c_target):
26
+
27
+ # normal_world = camNormal2worldNormal(np.linalg.inv(RT_w2c[:3,:3]), normal)
28
+ # normal_target_cam = worldNormal2camNormal(RT_w2c_target[:3,:3], normal_world)
29
+
30
+ relative_RT = np.matmul(RT_w2c_target[:3,:3], np.linalg.inv(RT_w2c[:3,:3]))
31
+ normal_target_cam = worldNormal2camNormal(relative_RT[:3,:3], normal)
32
+
33
+ return normal_target_cam
34
+
35
+ def img2normal(img):
36
+ return (img/255.)*2-1
37
+
38
+ def normal2img(normal):
39
+ return np.uint8((normal*0.5+0.5)*255)
40
+
41
+ def norm_normalize(normal, dim=-1):
42
+
43
+ normal = normal/(np.linalg.norm(normal, axis=dim, keepdims=True)+1e-6)
44
+
45
+ return normal
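
A short usage sketch for the helpers above (the normal-map file name is hypothetical): re-express a normal map rendered in one view's camera frame in the frame of the conditioning front view, which is how `load_normal` in the dataset below applies `trans_normal`.

```python
import numpy as np
from PIL import Image
from mvdiffusion.data.normal_utils import img2normal, normal2img, trans_normal

RT_w2c = np.loadtxt("mvdiffusion/data/fixed_poses/nine_views/000_right_RT.txt")
RT_w2c_cond = np.loadtxt("mvdiffusion/data/fixed_poses/nine_views/000_front_RT.txt")

# Hypothetical rendered normal map for the 'right' view, stored as an RGB image.
normal_img = np.array(Image.open("normals_000_right.png").convert("RGB"))
normal = img2normal(normal_img)                          # uint8 [0, 255] -> float [-1, 1]
normal_cond = trans_normal(normal, RT_w2c, RT_w2c_cond)  # rotate into the front-view camera frame
Image.fromarray(normal2img(normal_cond)).save("normals_000_right_in_front_frame.png")
```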
mvdiffusion/data/objaverse_dataset.py ADDED
@@ -0,0 +1,608 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from omegaconf import DictConfig, ListConfig
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from pathlib import Path
7
+ import json
8
+ from PIL import Image
9
+ from torchvision import transforms
10
+ from einops import rearrange
11
+ from typing import Literal, Tuple, Optional, Any
12
+ import cv2
13
+ import random
14
+
15
+ import json
16
+ import os, sys
17
+ import math
18
+
19
+ import PIL.Image
20
+ from .normal_utils import trans_normal, normal2img, img2normal
21
+ import pdb
22
+
23
+ def shift_list(lst, n):
24
+ length = len(lst)
25
+ n = n % length # Ensure n is within the range of the list length
26
+ return lst[-n:] + lst[:-n]
27
+
28
+
29
+ class ObjaverseDataset(Dataset):
30
+ def __init__(self,
31
+ root_dir: str,
32
+ num_views: int,
33
+ bg_color: Any,
34
+ img_wh: Tuple[int, int],
35
+ object_list: str,
36
+ groups_num: int=1,
37
+ validation: bool = False,
38
+ random_views: bool = False,
39
+ num_validation_samples: int = 64,
40
+ num_samples: Optional[int] = None,
41
+ invalid_list: Optional[str] = None,
42
+ trans_norm_system: bool = True, # if True, transform all normals map into the cam system of front view
43
+ augment_data: bool = False,
44
+ read_normal: bool = True,
45
+ read_color: bool = False,
46
+ read_depth: bool = False,
47
+ mix_color_normal: bool = False,
48
+ random_view_and_domain: bool = False
49
+ ) -> None:
50
+ """Create a dataset from a folder of images.
51
+ If you pass in a root directory it will be searched for images
52
+ ending in ext (ext can be a list)
53
+ """
54
+ self.root_dir = Path(root_dir)
55
+ self.num_views = num_views
56
+ self.bg_color = bg_color
57
+ self.validation = validation
58
+ self.num_samples = num_samples
59
+ self.trans_norm_system = trans_norm_system
60
+ self.augment_data = augment_data
61
+ self.invalid_list = invalid_list
62
+ self.groups_num = groups_num
63
+ print("augment data: ", self.augment_data)
64
+ self.img_wh = img_wh
65
+ self.read_normal = read_normal
66
+ self.read_color = read_color
67
+ self.read_depth = read_depth
68
+ self.mix_color_normal = mix_color_normal # mix load color and normal maps
69
+ self.random_view_and_domain = random_view_and_domain # load normal or rgb of a single view
70
+ self.random_views = random_views
71
+ if not self.random_views:
72
+ if self.num_views == 4:
73
+ self.view_types = ['front', 'right', 'back', 'left']
74
+ elif self.num_views == 5:
75
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left']
76
+ elif self.num_views == 6 or self.num_views==1:
77
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
78
+ else:
79
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
80
+
81
+ self.fix_cam_pose_dir = "./mvdiffusion/data/fixed_poses/nine_views"
82
+
83
+ self.fix_cam_poses = self.load_fixed_poses() # world2cam matrix
84
+
85
+ if object_list is not None:
86
+ with open(object_list) as f:
87
+ self.objects = json.load(f)
88
+ self.objects = [os.path.basename(o).replace(".glb", "") for o in self.objects]
89
+ else:
90
+ self.objects = os.listdir(self.root_dir)
91
+ self.objects = sorted(self.objects)
92
+
93
+ if self.invalid_list is not None:
94
+ with open(self.invalid_list) as f:
95
+ self.invalid_objects = json.load(f)
96
+ self.invalid_objects = [os.path.basename(o).replace(".glb", "") for o in self.invalid_objects]
97
+ else:
98
+ self.invalid_objects = []
99
+
100
+
101
+ self.all_objects = set(self.objects) - (set(self.invalid_objects) & set(self.objects))
102
+ self.all_objects = list(self.all_objects)
103
+
104
+ if not validation:
105
+ self.all_objects = self.all_objects[:-num_validation_samples]
106
+ else:
107
+ self.all_objects = self.all_objects[-num_validation_samples:]
108
+ if num_samples is not None:
109
+ self.all_objects = self.all_objects[:num_samples]
110
+
111
+ print("loading ", len(self.all_objects), " objects in the dataset")
112
+
113
+ if self.mix_color_normal:
114
+ self.backup_data = self.__getitem_mix__(0, "9438abf986c7453a9f4df7c34aa2e65b")
115
+ elif self.random_view_and_domain:
116
+ self.backup_data = self.__getitem_random_viewanddomain__(0, "9438abf986c7453a9f4df7c34aa2e65b")
117
+ else:
118
+ self.backup_data = self.__getitem_norm__(0, "9438abf986c7453a9f4df7c34aa2e65b") # "66b2134b7e3645b29d7c349645291f78")
119
+
120
122
+
123
+ def load_fixed_poses(self):
124
+ poses = {}
125
+ for face in self.view_types:
126
+ RT = np.loadtxt(os.path.join(self.fix_cam_pose_dir,'%03d_%s_RT.txt'%(0, face)))
127
+ poses[face] = RT
128
+
129
+ return poses
130
+
131
+ def cartesian_to_spherical(self, xyz):
132
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
133
+ xy = xyz[:,0]**2 + xyz[:,1]**2
134
+ z = np.sqrt(xy + xyz[:,2]**2)
135
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
136
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
137
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
138
+ return np.array([theta, azimuth, z])
139
+
140
+ def get_T(self, target_RT, cond_RT):
141
+ R, T = target_RT[:3, :3], target_RT[:, -1]
142
+ T_target = -R.T @ T # change to cam2world
143
+
144
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
145
+ T_cond = -R.T @ T
146
+
147
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
148
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
149
+
150
+ d_theta = theta_target - theta_cond
151
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
152
+ d_z = z_target - z_cond
153
+
154
+ # d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
155
+ return d_theta, d_azimuth
156
+
157
+ def get_bg_color(self):
158
+ if self.bg_color == 'white':
159
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
160
+ elif self.bg_color == 'black':
161
+ bg_color = np.array([0., 0., 0.], dtype=np.float32)
162
+ elif self.bg_color == 'gray':
163
+ bg_color = np.array([0.5, 0.5, 0.5], dtype=np.float32)
164
+ elif self.bg_color == 'random':
165
+ bg_color = np.random.rand(3)
166
+ elif self.bg_color == 'three_choices':
167
+ white = np.array([1., 1., 1.], dtype=np.float32)
168
+ black = np.array([0., 0., 0.], dtype=np.float32)
169
+ gray = np.array([0.5, 0.5, 0.5], dtype=np.float32)
170
+ bg_color = random.choice([white, black, gray])
171
+ elif isinstance(self.bg_color, float):
172
+ bg_color = np.array([self.bg_color] * 3, dtype=np.float32)
173
+ else:
174
+ raise NotImplementedError
175
+ return bg_color
176
+
177
+
178
+
179
+ def load_mask(self, img_path, return_type='np'):
180
+ # not using cv2 as may load in uint16 format
181
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
182
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
183
+ # pil always returns uint8
184
+ img = np.array(Image.open(img_path).resize(self.img_wh))
185
+ img = np.float32(img > 0)
186
+
187
+ assert len(np.shape(img)) == 2
188
+
189
+ if return_type == "np":
190
+ pass
191
+ elif return_type == "pt":
192
+ img = torch.from_numpy(img)
193
+ else:
194
+ raise NotImplementedError
195
+
196
+ return img
197
+
198
+ def load_image(self, img_path, bg_color, alpha, return_type='np'):
199
+ # not using cv2 as may load in uint16 format
200
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
201
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
202
+ # pil always returns uint8
203
+ img = np.array(Image.open(img_path).resize(self.img_wh))
204
+ img = img.astype(np.float32) / 255. # [0, 1]
205
+ assert img.shape[-1] == 3 # RGB
206
+
207
+ if alpha.shape[-1] != 1:
208
+ alpha = alpha[:, :, None]
209
+
210
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
211
+
212
+ if return_type == "np":
213
+ pass
214
+ elif return_type == "pt":
215
+ img = torch.from_numpy(img)
216
+ else:
217
+ raise NotImplementedError
218
+
219
+ return img
220
+
221
+ def load_depth(self, img_path, bg_color, alpha, return_type='np'):
222
+ # not using cv2 as may load in uint16 format
223
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
224
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
225
+ # pil always returns uint8
226
+ img = np.array(Image.open(img_path).resize(self.img_wh))
227
+ img = img.astype(np.float32) / 65535. # [0, 1]
228
+
229
+ img[img > 0.4] = 0
230
+ img = img / 0.4
231
+
232
+ assert img.ndim == 2 # depth
233
+ img = np.stack([img]*3, axis=-1)
234
+
235
+ if alpha.shape[-1] != 1:
236
+ alpha = alpha[:, :, None]
237
+
238
+ # print(np.max(img[:, :, 0]))
239
+
240
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
241
+
242
+ if return_type == "np":
243
+ pass
244
+ elif return_type == "pt":
245
+ img = torch.from_numpy(img)
246
+ else:
247
+ raise NotImplementedError
248
+
249
+ return img
250
+
251
+ def load_normal(self, img_path, bg_color, alpha, RT_w2c=None, RT_w2c_cond=None, return_type='np'):
252
+ # not using cv2 as may load in uint16 format
253
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
254
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
255
+ # pil always returns uint8
256
+ normal = np.array(Image.open(img_path).resize(self.img_wh))
257
+
258
+ assert normal.shape[-1] == 3 # RGB
259
+
260
+ normal = trans_normal(img2normal(normal), RT_w2c, RT_w2c_cond)
261
+
262
+ img = (normal*0.5 + 0.5).astype(np.float32) # [0, 1]
263
+
264
+ if alpha.shape[-1] != 1:
265
+ alpha = alpha[:, :, None]
266
+
267
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
268
+
269
+ if return_type == "np":
270
+ pass
271
+ elif return_type == "pt":
272
+ img = torch.from_numpy(img)
273
+ else:
274
+ raise NotImplementedError
275
+
276
+ return img
277
+
278
+ def __len__(self):
279
+ return len(self.all_objects)
280
+
281
+ def __getitem_mix__(self, index, debug_object=None):
282
+ if debug_object is not None:
283
+ object_name = debug_object #
284
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
285
+ else:
286
+ object_name = self.all_objects[index%len(self.all_objects)]
287
+ set_idx = 0
288
+
289
+ if self.augment_data:
290
+ cond_view = random.sample(self.view_types, k=1)[0]
291
+ else:
292
+ cond_view = 'front'
293
+
294
+ if random.random() < 0.5:
295
+ read_color, read_normal, read_depth = True, False, False
296
+ else:
297
+ read_color, read_normal, read_depth = False, True, True
298
+
299
+ read_normal = read_normal & self.read_normal
300
+ read_depth = read_depth & self.read_depth
301
+
302
+ assert (read_color and (read_normal or read_depth)) is False
303
+
304
+ view_types = self.view_types
305
+
306
+ cond_w2c = self.fix_cam_poses[cond_view]
307
+
308
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
309
+
310
+ elevations = []
311
+ azimuths = []
312
+
313
+ # get the bg color
314
+ bg_color = self.get_bg_color()
315
+
316
+ cond_alpha = self.load_mask(os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, cond_view)), return_type='np')
317
+ img_tensors_in = [
318
+ self.load_image(os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, cond_view)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
319
+ ] * self.num_views
320
+ img_tensors_out = []
321
+
322
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
323
+ img_path = os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, view))
324
+ mask_path = os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, view))
325
+ normal_path = os.path.join(self.root_dir, object_name[:3], object_name, "normals_%03d_%s.png" % (set_idx, view))
326
+ depth_path = os.path.join(self.root_dir, object_name[:3], object_name, "depth_%03d_%s.png" % (set_idx, view))
327
+ alpha = self.load_mask(mask_path, return_type='np')
328
+
329
+ if read_color:
330
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt")
331
+ img_tensor = img_tensor.permute(2, 0, 1)
332
+ img_tensors_out.append(img_tensor)
333
+
334
+ if read_normal:
335
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt").permute(2, 0, 1)
336
+ img_tensors_out.append(normal_tensor)
337
+ if read_depth:
338
+ depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt").permute(2, 0, 1)
339
+ img_tensors_out.append(depth_tensor)
340
+
341
+ # elevations, azimuths
342
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
343
+ elevations.append(elevation)
344
+ azimuths.append(azimuth)
345
+
346
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
347
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
348
+
349
+
350
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
351
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
352
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # fixed only use 4 views to train
353
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
354
+
355
+ normal_class = torch.tensor([1, 0]).float()
356
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
357
+ color_class = torch.tensor([0, 1]).float()
358
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
359
+ if read_normal or read_depth:
360
+ task_embeddings = normal_task_embeddings
361
+ if read_color:
362
+ task_embeddings = color_task_embeddings
363
+
364
+ return {
365
+ 'elevations_cond': elevations_cond,
366
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
367
+ 'elevations': elevations,
368
+ 'azimuths': azimuths,
369
+ 'elevations_deg': torch.rad2deg(elevations),
370
+ 'azimuths_deg': torch.rad2deg(azimuths),
371
+ 'imgs_in': img_tensors_in,
372
+ 'imgs_out': img_tensors_out,
373
+ 'camera_embeddings': camera_embeddings,
374
+ 'task_embeddings': task_embeddings
375
+ }
376
+
377
+
378
+ def __getitem_random_viewanddomain__(self, index, debug_object=None):
379
+ if debug_object is not None:
380
+ object_name = debug_object #
381
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
382
+ else:
383
+ object_name = self.all_objects[index%len(self.all_objects)]
384
+ set_idx = 0
385
+
386
+ if self.augment_data:
387
+ cond_view = random.sample(self.view_types, k=1)[0]
388
+ else:
389
+ cond_view = 'front'
390
+
391
+ if random.random() < 0.5:
392
+ read_color, read_normal, read_depth = True, False, False
393
+ else:
394
+ read_color, read_normal, read_depth = False, True, True
395
+
396
+ read_normal = read_normal & self.read_normal
397
+ read_depth = read_depth & self.read_depth
398
+
399
+ assert (read_color and (read_normal or read_depth)) is False
400
+
401
+ view_types = self.view_types
402
+
403
+ cond_w2c = self.fix_cam_poses[cond_view]
404
+
405
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
406
+
407
+ elevations = []
408
+ azimuths = []
409
+
410
+ # get the bg color
411
+ bg_color = self.get_bg_color()
412
+
413
+ cond_alpha = self.load_mask(os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, cond_view)), return_type='np')
414
+ img_tensors_in = [
415
+ self.load_image(os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, cond_view)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
416
+ ] * self.num_views
417
+ img_tensors_out = []
418
+
419
+ random_viewidx = random.randint(0, len(view_types)-1)
420
+
421
+ for view, tgt_w2c in zip([view_types[random_viewidx]], [tgt_w2cs[random_viewidx]]):
422
+ img_path = os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, view))
423
+ mask_path = os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, view))
424
+ normal_path = os.path.join(self.root_dir, object_name[:3], object_name, "normals_%03d_%s.png" % (set_idx, view))
425
+ depth_path = os.path.join(self.root_dir, object_name[:3], object_name, "depth_%03d_%s.png" % (set_idx, view))
426
+ alpha = self.load_mask(mask_path, return_type='np')
427
+
428
+ if read_color:
429
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt")
430
+ img_tensor = img_tensor.permute(2, 0, 1)
431
+ img_tensors_out.append(img_tensor)
432
+
433
+ if read_normal:
434
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt").permute(2, 0, 1)
435
+ img_tensors_out.append(normal_tensor)
436
+ if read_depth:
437
+ depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt").permute(2, 0, 1)
438
+ img_tensors_out.append(depth_tensor)
439
+
440
+ # elevations, azimuths
441
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
442
+ elevations.append(elevation)
443
+ azimuths.append(azimuth)
444
+
445
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
446
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
447
+
448
+
449
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
450
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
451
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # fixed only use 4 views to train
452
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
453
+
454
+ normal_class = torch.tensor([1, 0]).float()
455
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
456
+ color_class = torch.tensor([0, 1]).float()
457
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
458
+ if read_normal or read_depth:
459
+ task_embeddings = normal_task_embeddings
460
+ if read_color:
461
+ task_embeddings = color_task_embeddings
462
+
463
+ return {
464
+ 'elevations_cond': elevations_cond,
465
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
466
+ 'elevations': elevations,
467
+ 'azimuths': azimuths,
468
+ 'elevations_deg': torch.rad2deg(elevations),
469
+ 'azimuths_deg': torch.rad2deg(azimuths),
470
+ 'imgs_in': img_tensors_in,
471
+ 'imgs_out': img_tensors_out,
472
+ 'camera_embeddings': camera_embeddings,
473
+ 'task_embeddings': task_embeddings
474
+ }
475
+
476
+
477
+ def __getitem_norm__(self, index, debug_object=None):
478
+ if debug_object is not None:
479
+ object_name = debug_object #
480
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
481
+ else:
482
+ object_name = self.all_objects[index%len(self.all_objects)]
483
+ set_idx = 0
484
+
485
+ if self.augment_data:
486
+ cond_view = random.sample(self.view_types, k=1)[0]
487
+ else:
488
+ cond_view = 'front'
489
+
490
+ # if self.random_views:
491
+ # view_types = ['front']+random.sample(self.view_types[1:], 3)
492
+ # else:
493
+ # view_types = self.view_types
494
+
495
+ view_types = self.view_types
496
+
497
+ cond_w2c = self.fix_cam_poses[cond_view]
498
+
499
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
500
+
501
+ elevations = []
502
+ azimuths = []
503
+
504
+ # get the bg color
505
+ bg_color = self.get_bg_color()
506
+
507
+ cond_alpha = self.load_mask(os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, cond_view)), return_type='np')
508
+ img_tensors_in = [
509
+ self.load_image(os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, cond_view)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
510
+ ] * self.num_views
511
+ img_tensors_out = []
512
+ normal_tensors_out = []
513
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
514
+ img_path = os.path.join(self.root_dir, object_name[:3], object_name, "rgb_%03d_%s.png" % (set_idx, view))
515
+ mask_path = os.path.join(self.root_dir, object_name[:3], object_name, "mask_%03d_%s.png" % (set_idx, view))
516
+ alpha = self.load_mask(mask_path, return_type='np')
517
+
518
+ if self.read_color:
519
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt")
520
+ img_tensor = img_tensor.permute(2, 0, 1)
521
+ img_tensors_out.append(img_tensor)
522
+
523
+ if self.read_normal:
524
+ normal_path = os.path.join(self.root_dir, object_name[:3], object_name, "normals_%03d_%s.png" % (set_idx, view))
525
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt").permute(2, 0, 1)
526
+ normal_tensors_out.append(normal_tensor)
527
+
528
+ # elevations, azimuths
529
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
530
+ elevations.append(elevation)
531
+ azimuths.append(azimuth)
532
+
533
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
534
+ if self.read_color:
535
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
536
+ if self.read_normal:
537
+ normal_tensors_out = torch.stack(normal_tensors_out, dim=0).float() # (Nv, 3, H, W)
538
+
539
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
540
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
541
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # fixed only use 4 views to train
542
+
543
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
544
+
545
+ normal_class = torch.tensor([1, 0]).float()
546
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
547
+ color_class = torch.tensor([0, 1]).float()
548
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
549
+
550
+ return {
551
+ 'elevations_cond': elevations_cond,
552
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
553
+ 'elevations': elevations,
554
+ 'azimuths': azimuths,
555
+ 'elevations_deg': torch.rad2deg(elevations),
556
+ 'azimuths_deg': torch.rad2deg(azimuths),
557
+ 'imgs_in': img_tensors_in,
558
+ 'imgs_out': img_tensors_out,
559
+ 'normals_out': normal_tensors_out,
560
+ 'camera_embeddings': camera_embeddings,
561
+ 'normal_task_embeddings': normal_task_embeddings,
562
+ 'color_task_embeddings': color_task_embeddings
563
+ }
564
+
565
+ def __getitem__(self, index):
566
+
567
+ try:
568
+ if self.mix_color_normal:
569
+ data = self.__getitem_mix__(index)
570
+ elif self.random_view_and_domain:
571
+ data = self.__getitem_random_viewanddomain__(index)
572
+ else:
573
+ data = self.__getitem_norm__(index)
574
+ return data
575
+ except:
576
+ print("load error ", self.all_objects[index%len(self.all_objects)] )
577
+ return self.backup_data
578
+
579
+
580
+ class ConcatDataset(torch.utils.data.Dataset):
581
+ def __init__(self, datasets, weights):
582
+ self.datasets = datasets
583
+ self.weights = weights
584
+ self.num_datasets = len(datasets)
585
+
586
+ def __getitem__(self, i):
587
+
588
+ chosen = random.choices(self.datasets, self.weights, k=1)[0]
589
+ return chosen[i]
590
+
591
+ def __len__(self):
592
+ return max(len(d) for d in self.datasets)
593
+
594
+ if __name__ == "__main__":
+     # Smoke test: the arguments below match the ObjaverseDataset signature defined above.
+     train_dataset = ObjaverseDataset(
+         root_dir="/ghome/l5/xxlong/.objaverse/hf-objaverse-v1/renderings",
+         num_views=4,
+         bg_color='white',
+         img_wh=(128, 128),
+         object_list=None,
+         validation=False,
+     )
+     data0 = train_dataset[0]
+     data1 = train_dataset[50]
+     # print(data0)
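
Assuming the rendered Objaverse data is actually present under `root_dir`, a sample returned by the constructor call above would look roughly like this (a sketch, not part of the commit):

```python
sample = train_dataset[0]

print(sample['imgs_in'].shape)            # torch.Size([4, 3, 128, 128]) - front view repeated Nv times
print(sample['normals_out'].shape)        # torch.Size([4, 3, 128, 128]) - target normals (read_normal=True)
print(sample['camera_embeddings'].shape)  # torch.Size([4, 3]) - (cond elevation, d_elevation, d_azimuth)
# 'imgs_out' is only populated when the dataset is built with read_color=True.
```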
mvdiffusion/data/single_image_dataset.py ADDED
@@ -0,0 +1,321 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from omegaconf import DictConfig, ListConfig
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from pathlib import Path
7
+ import json
8
+ from PIL import Image
9
+ from torchvision import transforms
10
+ from einops import rearrange
11
+ from typing import Literal, Tuple, Optional, Any
12
+ import cv2
13
+ import random
14
+
15
+ import json
16
+ import os, sys
17
+ import math
18
+
19
+ from glob import glob
20
+
21
+ import PIL.Image
22
+ from .normal_utils import trans_normal, normal2img, img2normal
23
+ import pdb
24
+
25
+
26
+ import cv2
27
+ import numpy as np
28
+
29
+ def add_margin(pil_img, color=0, size=256):
30
+ width, height = pil_img.size
31
+ result = Image.new(pil_img.mode, (size, size), color)
32
+ result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
33
+ return result
34
+
35
+ def scale_and_place_object(image, scale_factor):
36
+ assert np.shape(image)[-1]==4 # RGBA
37
+
38
+ # Extract the alpha channel (transparency) and the object (RGB channels)
39
+ alpha_channel = image[:, :, 3]
40
+
41
+ # Find the bounding box coordinates of the object
42
+ coords = cv2.findNonZero(alpha_channel)
43
+ x, y, width, height = cv2.boundingRect(coords)
44
+
45
+ # Calculate the scale factor for resizing
46
+ original_height, original_width = image.shape[:2]
47
+
48
+ if width > height:
49
+ size = width
50
+ original_size = original_width
51
+ else:
52
+ size = height
53
+ original_size = original_height
54
+
55
+ scale_factor = min(scale_factor, size / (original_size+0.0))
56
+
57
+ new_size = scale_factor * original_size
58
+ scale_factor = new_size / size
59
+
60
+ # Calculate the new size based on the scale factor
61
+ new_width = int(width * scale_factor)
62
+ new_height = int(height * scale_factor)
63
+
64
+ center_x = original_width // 2
65
+ center_y = original_height // 2
66
+
67
+ paste_x = center_x - (new_width // 2)
68
+ paste_y = center_y - (new_height // 2)
69
+
70
+ # Resize the object (RGB channels) to the new size
71
+ rescaled_object = cv2.resize(image[y:y+height, x:x+width], (new_width, new_height))
72
+
73
+ # Create a new RGBA image with the resized image
74
+ new_image = np.zeros((original_height, original_width, 4), dtype=np.uint8)
75
+
76
+ new_image[paste_y:paste_y + new_height, paste_x:paste_x + new_width] = rescaled_object
77
+
78
+ return new_image
79
+
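add_margin pastes an RGBA image onto a square canvas of side `size`, centred, so the later bicubic resize never distorts the aspect ratio; scale_and_place_object does the analogous job in NumPy/OpenCV. A small sketch of add_margin with a dummy foreground (not part of the commit):

from PIL import Image

rgba = Image.new("RGBA", (120, 80), (255, 0, 0, 255))   # dummy 120x80 opaque red object
padded = add_margin(rgba, color=0, size=256)            # 256x256, object centred, transparent border
assert padded.size == (256, 256)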
80
+ class SingleImageDataset(Dataset):
81
+ def __init__(self,
82
+ root_dir: str,
83
+ num_views: int,
84
+ img_wh: Tuple[int, int],
85
+ bg_color: str,
86
+ crop_size: int = 224,
87
+ num_validation_samples: Optional[int] = None,
88
+ filepaths: Optional[list] = None,
89
+ cond_type: Optional[str] = None
90
+ ) -> None:
91
+ """Create a dataset from a folder of images.
92
+ If you pass in a root directory it will be searched for images
93
+ ending in ext (ext can be a list)
94
+ """
95
+ self.root_dir = Path(root_dir)
96
+ self.num_views = num_views
97
+ self.img_wh = img_wh
98
+ self.crop_size = crop_size
99
+ self.bg_color = bg_color
100
+ self.cond_type = cond_type
101
+
102
+ if self.num_views == 4:
103
+ self.view_types = ['front', 'right', 'back', 'left']
104
+ elif self.num_views == 5:
105
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left']
106
+ elif self.num_views == 6:
107
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
108
+
109
+ self.fix_cam_pose_dir = "./mvdiffusion/data/fixed_poses/nine_views"
110
+
111
+ self.fix_cam_poses = self.load_fixed_poses() # world2cam matrix
112
+
113
+ if filepaths is None:
114
+ # Get a list of all files in the directory
115
+ file_list = os.listdir(self.root_dir)
116
+ else:
117
+ file_list = filepaths
118
+
119
+ if self.cond_type is None:
120
+ # Filter the files that end with .png or .jpg
121
+ self.file_list = [file for file in file_list if file.endswith(('.png', '.jpg'))]
122
+ self.cond_dirs = None
123
+ else:
124
+ self.file_list = []
125
+ self.cond_dirs = []
126
+ for scene in file_list:
127
+ self.file_list.append(os.path.join(scene, f"{scene}.png"))
128
+ if self.cond_type == 'normals':
129
+ self.cond_dirs.append(os.path.join(self.root_dir, scene, 'outs'))
130
+ else:
131
+ self.cond_dirs.append(os.path.join(self.root_dir, scene))
132
+
133
+ # load all images
134
+ self.all_images = []
135
+ self.all_alphas = []
136
+ bg_color = self.get_bg_color()
137
+ for file in self.file_list:
138
+ image, alpha = self.load_image(os.path.join(self.root_dir, file), bg_color, return_type='pt')
139
+ self.all_images.append(image)
140
+ self.all_alphas.append(alpha)
141
+
142
+ self.all_images = self.all_images[:num_validation_samples]
143
+ self.all_alphas = self.all_alphas[:num_validation_samples]
144
+
145
+
146
+ def __len__(self):
147
+ return len(self.all_images)
148
+
149
+ def load_fixed_poses(self):
150
+ poses = {}
151
+ for face in self.view_types:
152
+ RT = np.loadtxt(os.path.join(self.fix_cam_pose_dir,'%03d_%s_RT.txt'%(0, face)))
153
+ poses[face] = RT
154
+
155
+ return poses
156
+
157
+ def cartesian_to_spherical(self, xyz):
158
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
159
+ xy = xyz[:,0]**2 + xyz[:,1]**2
160
+ z = np.sqrt(xy + xyz[:,2]**2)
161
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
162
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
163
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
164
+ return np.array([theta, azimuth, z])
165
+
166
+ def get_T(self, target_RT, cond_RT):
167
+ R, T = target_RT[:3, :3], target_RT[:, -1]
168
+ T_target = -R.T @ T # change to cam2world
169
+
170
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
171
+ T_cond = -R.T @ T
172
+
173
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
174
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
175
+
176
+ d_theta = theta_target - theta_cond
177
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
178
+ d_z = z_target - z_cond
179
+
180
+ # d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
181
+ return d_theta, d_azimuth
182
+
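get_T recovers each camera centre with -R.T @ T, converts it to spherical coordinates and returns the target view's elevation/azimuth relative to the condition view (azimuth wrapped into [0, 2*pi)). A standalone numeric sketch of the same idea, starting directly from two hypothetical camera centres rather than RT matrices:

import math
import numpy as np

def spherical(p):                                # p: camera centre in world coordinates
    x, y, z = p
    theta = math.atan2(math.hypot(x, y), z)      # polar angle measured from +Z, as above
    azimuth = math.atan2(y, x)
    return theta, azimuth, float(np.linalg.norm(p))

c_cond = np.array([1.0, 0.0, 0.0])               # "front" camera
c_target = np.array([0.0, 1.0, 0.0])             # camera rotated 90 degrees about Z

theta_c, az_c, _ = spherical(c_cond)
theta_t, az_t, _ = spherical(c_target)
d_elevation = theta_t - theta_c                  # 0.0: same elevation
d_azimuth = (az_t - az_c) % (2 * math.pi)        # pi/2
print(math.degrees(d_elevation), math.degrees(d_azimuth))   # 0.0 90.0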
183
+ def get_bg_color(self):
184
+ if self.bg_color == 'white':
185
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
186
+ elif self.bg_color == 'black':
187
+ bg_color = np.array([0., 0., 0.], dtype=np.float32)
188
+ elif self.bg_color == 'gray':
189
+ bg_color = np.array([0.5, 0.5, 0.5], dtype=np.float32)
190
+ elif self.bg_color == 'random':
191
+ bg_color = np.random.rand(3)
192
+ elif isinstance(self.bg_color, float):
193
+ bg_color = np.array([self.bg_color] * 3, dtype=np.float32)
194
+ else:
195
+ raise NotImplementedError
196
+ return bg_color
197
+
198
+
199
+ def load_image(self, img_path, bg_color, return_type='np'):
200
+ # pil always returns uint8
201
+ image_input = Image.open(img_path)
202
+ image_size = self.img_wh[0]
203
+
204
+ if self.crop_size!=-1:
205
+ alpha_np = np.asarray(image_input)[:, :, 3]
206
+ coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
207
+ min_x, min_y = np.min(coords, 0)
208
+ max_x, max_y = np.max(coords, 0)
209
+ ref_img_ = image_input.crop((min_x, min_y, max_x, max_y))
210
+ h, w = ref_img_.height, ref_img_.width
211
+ scale = self.crop_size / max(h, w)
212
+ h_, w_ = int(scale * h), int(scale * w)
213
+ ref_img_ = ref_img_.resize((w_, h_), resample=Image.BICUBIC)
214
+ image_input = add_margin(ref_img_, size=image_size)
215
+ else:
216
+ image_input = add_margin(image_input, size=max(image_input.height, image_input.width))
217
+ image_input = image_input.resize((image_size, image_size), resample=Image.BICUBIC)
218
+
219
+ # img = scale_and_place_object(img, self.scale_ratio)
220
+ img = np.array(image_input)
221
+ img = img.astype(np.float32) / 255. # [0, 1]
222
+ assert img.shape[-1] == 4 # RGBA
223
+
224
+ alpha = img[...,3:4]
225
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
226
+
227
+ if return_type == "np":
228
+ pass
229
+ elif return_type == "pt":
230
+ img = torch.from_numpy(img)
231
+ alpha = torch.from_numpy(alpha)
232
+ else:
233
+ raise NotImplementedError
234
+
235
+ return img, alpha
236
+
237
+ def load_conds(self, directory):
238
+ assert self.crop_size == -1
239
+ image_size = self.img_wh[0]
240
+ conds = []
241
+ for view in self.view_types:
242
+ cond_file = f"{self.cond_type}_000_{view}.png"
243
+ image_input = Image.open(os.path.join(directory, cond_file))
244
+ image_input = image_input.resize((image_size, image_size), resample=Image.BICUBIC)
245
+ image_input = np.array(image_input)[:, :, :3] / 255.
246
+ conds.append(image_input)
247
+
248
+ conds = np.stack(conds, axis=0)
249
+ conds = torch.from_numpy(conds).permute(0, 3, 1, 2) # B, 3, H, W
250
+ return conds
251
+
252
+ def __len__(self):
253
+ return len(self.all_images)
254
+
255
+ def __getitem__(self, index):
256
+
257
+ image = self.all_images[index%len(self.all_images)]
258
+ alpha = self.all_alphas[index%len(self.all_images)]
259
+ filename = self.file_list[index%len(self.all_images)].replace(".png", "")
260
+
261
+ if self.cond_type is not None:
262
+ conds = self.load_conds(self.cond_dirs[index%len(self.all_images)])
263
+ else:
264
+ conds = None
265
+
266
+ cond_w2c = self.fix_cam_poses['front']
267
+
268
+ tgt_w2cs = [self.fix_cam_poses[view] for view in self.view_types]
269
+
270
+ elevations = []
271
+ azimuths = []
272
+
273
+ img_tensors_in = [
274
+ image.permute(2, 0, 1)
275
+ ] * self.num_views
276
+
277
+ alpha_tensors_in = [
278
+ alpha.permute(2, 0, 1)
279
+ ] * self.num_views
280
+
281
+ for view, tgt_w2c in zip(self.view_types, tgt_w2cs):
282
+ # elevations, azimuths
283
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
284
+ elevations.append(elevation)
285
+ azimuths.append(azimuth)
286
+
287
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
288
+ alpha_tensors_in = torch.stack(alpha_tensors_in, dim=0).float() # (Nv, 3, H, W)
289
+
290
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
291
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
292
+ elevations_cond = torch.as_tensor([0] * self.num_views).float()
293
+
294
+ normal_class = torch.tensor([1, 0]).float()
295
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
296
+ color_class = torch.tensor([0, 1]).float()
297
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
298
+
299
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
300
+
301
+ out = {
302
+ 'elevations_cond': elevations_cond,
303
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
304
+ 'elevations': elevations,
305
+ 'azimuths': azimuths,
306
+ 'elevations_deg': torch.rad2deg(elevations),
307
+ 'azimuths_deg': torch.rad2deg(azimuths),
308
+ 'imgs_in': img_tensors_in,
309
+ 'alphas': alpha_tensors_in,
310
+ 'camera_embeddings': camera_embeddings,
311
+ 'normal_task_embeddings': normal_task_embeddings,
312
+ 'color_task_embeddings': color_task_embeddings,
313
+ 'filename': filename,
314
+ }
315
+
316
+ if conds is not None:
317
+ out['conds'] = conds
318
+
319
+ return out
320
+
321
+
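A hedged sketch of how SingleImageDataset is typically driven at inference time: point it at a folder of RGBA .png inputs (the path below is a placeholder) and read back the replicated input views together with the camera and task embeddings:

from torch.utils.data import DataLoader

dataset = SingleImageDataset(
    root_dir="./example_images",    # placeholder: any folder of RGBA .png files
    num_views=6,
    img_wh=(256, 256),
    bg_color="white",
    crop_size=224,
)
sample = dataset[0]
print(sample["imgs_in"].shape)             # (6, 3, 256, 256): the same input image per target view
print(sample["camera_embeddings"].shape)   # (6, 3)
loader = DataLoader(dataset, batch_size=1, shuffle=False)   # ready for the inference loop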
mvdiffusion/models/transformer_mv2d.py ADDED
@@ -0,0 +1,1005 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, Optional
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
23
+ from diffusers.utils import BaseOutput, deprecate, maybe_allow_in_graph
24
+ from diffusers.models.attention import FeedForward, AdaLayerNorm, AdaLayerNormZero, Attention
25
+ from diffusers.models.embeddings import PatchEmbed
26
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
27
+ from diffusers.models.modeling_utils import ModelMixin
28
+ from diffusers.utils.import_utils import is_xformers_available
29
+
30
+ from einops import rearrange
31
+ import pdb
32
+ import random
33
+
34
+
35
+ if is_xformers_available():
36
+ import xformers
37
+ import xformers.ops
38
+ else:
39
+ xformers = None
40
+
41
+
42
+ @dataclass
43
+ class TransformerMV2DModelOutput(BaseOutput):
44
+ """
45
+ The output of [`Transformer2DModel`].
46
+
47
+ Args:
48
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
49
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
50
+ distributions for the unnoised latent pixels.
51
+ """
52
+
53
+ sample: torch.FloatTensor
54
+
55
+
56
+ class TransformerMV2DModel(ModelMixin, ConfigMixin):
57
+ """
58
+ A 2D Transformer model for image-like data.
59
+
60
+ Parameters:
61
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
62
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
63
+ in_channels (`int`, *optional*):
64
+ The number of channels in the input and output (specify if the input is **continuous**).
65
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
66
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
67
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
68
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
69
+ This is fixed during training since it is used to learn a number of position embeddings.
70
+ num_vector_embeds (`int`, *optional*):
71
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
72
+ Includes the class for the masked latent pixel.
73
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
74
+ num_embeds_ada_norm ( `int`, *optional*):
75
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
76
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
77
+ added to the hidden states.
78
+
79
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
80
+ attention_bias (`bool`, *optional*):
81
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
82
+ """
83
+
84
+ @register_to_config
85
+ def __init__(
86
+ self,
87
+ num_attention_heads: int = 16,
88
+ attention_head_dim: int = 88,
89
+ in_channels: Optional[int] = None,
90
+ out_channels: Optional[int] = None,
91
+ num_layers: int = 1,
92
+ dropout: float = 0.0,
93
+ norm_num_groups: int = 32,
94
+ cross_attention_dim: Optional[int] = None,
95
+ attention_bias: bool = False,
96
+ sample_size: Optional[int] = None,
97
+ num_vector_embeds: Optional[int] = None,
98
+ patch_size: Optional[int] = None,
99
+ activation_fn: str = "geglu",
100
+ num_embeds_ada_norm: Optional[int] = None,
101
+ use_linear_projection: bool = False,
102
+ only_cross_attention: bool = False,
103
+ upcast_attention: bool = False,
104
+ norm_type: str = "layer_norm",
105
+ norm_elementwise_affine: bool = True,
106
+ num_views: int = 1,
107
+ joint_attention: bool=False,
108
+ joint_attention_twice: bool=False,
109
+ multiview_attention: bool=True,
110
+ cross_domain_attention: bool=False
111
+ ):
112
+ super().__init__()
113
+ self.use_linear_projection = use_linear_projection
114
+ self.num_attention_heads = num_attention_heads
115
+ self.attention_head_dim = attention_head_dim
116
+ inner_dim = num_attention_heads * attention_head_dim
117
+
118
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
119
+ # Define whether input is continuous or discrete depending on configuration
120
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
121
+ self.is_input_vectorized = num_vector_embeds is not None
122
+ self.is_input_patches = in_channels is not None and patch_size is not None
123
+
124
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
125
+ deprecation_message = (
126
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
127
+ " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
128
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
129
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
130
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
131
+ )
132
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
133
+ norm_type = "ada_norm"
134
+
135
+ if self.is_input_continuous and self.is_input_vectorized:
136
+ raise ValueError(
137
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
138
+ " sure that either `in_channels` or `num_vector_embeds` is None."
139
+ )
140
+ elif self.is_input_vectorized and self.is_input_patches:
141
+ raise ValueError(
142
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
143
+ " sure that either `num_vector_embeds` or `num_patches` is None."
144
+ )
145
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
146
+ raise ValueError(
147
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
148
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
149
+ )
150
+
151
+ # 2. Define input layers
152
+ if self.is_input_continuous:
153
+ self.in_channels = in_channels
154
+
155
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
156
+ if use_linear_projection:
157
+ self.proj_in = LoRACompatibleLinear(in_channels, inner_dim)
158
+ else:
159
+ self.proj_in = LoRACompatibleConv(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
160
+ elif self.is_input_vectorized:
161
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
162
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
163
+
164
+ self.height = sample_size
165
+ self.width = sample_size
166
+ self.num_vector_embeds = num_vector_embeds
167
+ self.num_latent_pixels = self.height * self.width
168
+
169
+ self.latent_image_embedding = ImagePositionalEmbeddings(
170
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
171
+ )
172
+ elif self.is_input_patches:
173
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
174
+
175
+ self.height = sample_size
176
+ self.width = sample_size
177
+
178
+ self.patch_size = patch_size
179
+ self.pos_embed = PatchEmbed(
180
+ height=sample_size,
181
+ width=sample_size,
182
+ patch_size=patch_size,
183
+ in_channels=in_channels,
184
+ embed_dim=inner_dim,
185
+ )
186
+
187
+ # 3. Define transformers blocks
188
+ self.transformer_blocks = nn.ModuleList(
189
+ [
190
+ BasicMVTransformerBlock(
191
+ inner_dim,
192
+ num_attention_heads,
193
+ attention_head_dim,
194
+ dropout=dropout,
195
+ cross_attention_dim=cross_attention_dim,
196
+ activation_fn=activation_fn,
197
+ num_embeds_ada_norm=num_embeds_ada_norm,
198
+ attention_bias=attention_bias,
199
+ only_cross_attention=only_cross_attention,
200
+ upcast_attention=upcast_attention,
201
+ norm_type=norm_type,
202
+ norm_elementwise_affine=norm_elementwise_affine,
203
+ num_views=num_views,
204
+ joint_attention=joint_attention,
205
+ joint_attention_twice=joint_attention_twice,
206
+ multiview_attention=multiview_attention,
207
+ cross_domain_attention=cross_domain_attention
208
+ )
209
+ for d in range(num_layers)
210
+ ]
211
+ )
212
+
213
+ # 4. Define output layers
214
+ self.out_channels = in_channels if out_channels is None else out_channels
215
+ if self.is_input_continuous:
216
+ # TODO: should use out_channels for continuous projections
217
+ if use_linear_projection:
218
+ self.proj_out = LoRACompatibleLinear(inner_dim, in_channels)
219
+ else:
220
+ self.proj_out = LoRACompatibleConv(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
221
+ elif self.is_input_vectorized:
222
+ self.norm_out = nn.LayerNorm(inner_dim)
223
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
224
+ elif self.is_input_patches:
225
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
226
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
227
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
228
+
229
+ def forward(
230
+ self,
231
+ hidden_states: torch.Tensor,
232
+ encoder_hidden_states: Optional[torch.Tensor] = None,
233
+ timestep: Optional[torch.LongTensor] = None,
234
+ class_labels: Optional[torch.LongTensor] = None,
235
+ cross_attention_kwargs: Dict[str, Any] = None,
236
+ attention_mask: Optional[torch.Tensor] = None,
237
+ encoder_attention_mask: Optional[torch.Tensor] = None,
238
+ return_dict: bool = True,
239
+ ):
240
+ """
241
+ The [`Transformer2DModel`] forward method.
242
+
243
+ Args:
244
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
245
+ Input `hidden_states`.
246
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
247
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
248
+ self-attention.
249
+ timestep ( `torch.LongTensor`, *optional*):
250
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
251
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
252
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
253
+ `AdaLayerZeroNorm`.
254
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
255
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
256
+
257
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
258
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
259
+
260
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
261
+ above. This bias will be added to the cross-attention scores.
262
+ return_dict (`bool`, *optional*, defaults to `True`):
263
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
264
+ tuple.
265
+
266
+ Returns:
267
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
268
+ `tuple` where the first element is the sample tensor.
269
+ """
270
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
271
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
272
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
273
+ # expects mask of shape:
274
+ # [batch, key_tokens]
275
+ # adds singleton query_tokens dimension:
276
+ # [batch, 1, key_tokens]
277
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
278
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
279
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
280
+ if attention_mask is not None and attention_mask.ndim == 2:
281
+ # assume that mask is expressed as:
282
+ # (1 = keep, 0 = discard)
283
+ # convert mask into a bias that can be added to attention scores:
284
+ # (keep = +0, discard = -10000.0)
285
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
286
+ attention_mask = attention_mask.unsqueeze(1)
287
+
288
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
289
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
290
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
291
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
292
+
293
+ # 1. Input
294
+ if self.is_input_continuous:
295
+ batch, _, height, width = hidden_states.shape
296
+ residual = hidden_states
297
+
298
+ hidden_states = self.norm(hidden_states)
299
+ if not self.use_linear_projection:
300
+ hidden_states = self.proj_in(hidden_states)
301
+ inner_dim = hidden_states.shape[1]
302
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
303
+ else:
304
+ inner_dim = hidden_states.shape[1]
305
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
306
+ hidden_states = self.proj_in(hidden_states)
307
+ elif self.is_input_vectorized:
308
+ hidden_states = self.latent_image_embedding(hidden_states)
309
+ elif self.is_input_patches:
310
+ hidden_states = self.pos_embed(hidden_states)
311
+
312
+ # 2. Blocks
313
+ for block in self.transformer_blocks:
314
+ hidden_states = block(
315
+ hidden_states,
316
+ attention_mask=attention_mask,
317
+ encoder_hidden_states=encoder_hidden_states,
318
+ encoder_attention_mask=encoder_attention_mask,
319
+ timestep=timestep,
320
+ cross_attention_kwargs=cross_attention_kwargs,
321
+ class_labels=class_labels,
322
+ )
323
+
324
+ # 3. Output
325
+ if self.is_input_continuous:
326
+ if not self.use_linear_projection:
327
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
328
+ hidden_states = self.proj_out(hidden_states)
329
+ else:
330
+ hidden_states = self.proj_out(hidden_states)
331
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
332
+
333
+ output = hidden_states + residual
334
+ elif self.is_input_vectorized:
335
+ hidden_states = self.norm_out(hidden_states)
336
+ logits = self.out(hidden_states)
337
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
338
+ logits = logits.permute(0, 2, 1)
339
+
340
+ # log(p(x_0))
341
+ output = F.log_softmax(logits.double(), dim=1).float()
342
+ elif self.is_input_patches:
343
+ # TODO: cleanup!
344
+ conditioning = self.transformer_blocks[0].norm1.emb(
345
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
346
+ )
347
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
348
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
349
+ hidden_states = self.proj_out_2(hidden_states)
350
+
351
+ # unpatchify
352
+ height = width = int(hidden_states.shape[1] ** 0.5)
353
+ hidden_states = hidden_states.reshape(
354
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
355
+ )
356
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
357
+ output = hidden_states.reshape(
358
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
359
+ )
360
+
361
+ if not return_dict:
362
+ return (output,)
363
+
364
+ return TransformerMV2DModelOutput(sample=output)
365
+
366
+
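A minimal smoke-test sketch, not part of the commit, that instantiates the multi-view transformer and runs a dummy forward pass. The hyper-parameters and shapes are illustrative only, and it assumes a CUDA device with xformers installed, since these blocks are meant to run with the memory-efficient attention processors defined below:

import torch

model = TransformerMV2DModel(
    num_attention_heads=8,
    attention_head_dim=40,        # inner_dim = 8 * 40 = 320
    in_channels=320,
    num_layers=1,
    cross_attention_dim=768,
    num_views=4,                  # the batch is laid out as (B * num_views, C, H, W)
).cuda()
model.enable_xformers_memory_efficient_attention()   # installs XFormersMVAttnProcessor in each block

hidden = torch.randn(2 * 4, 320, 32, 32, device="cuda")   # 2 objects x 4 views of latent features
text = torch.randn(2 * 4, 77, 768, device="cuda")         # per-view conditioning embeddings
with torch.no_grad():
    out = model(hidden, encoder_hidden_states=text).sample
print(out.shape)                                          # torch.Size([8, 320, 32, 32])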
367
+ @maybe_allow_in_graph
368
+ class BasicMVTransformerBlock(nn.Module):
369
+ r"""
370
+ A basic Transformer block.
371
+
372
+ Parameters:
373
+ dim (`int`): The number of channels in the input and output.
374
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
375
+ attention_head_dim (`int`): The number of channels in each head.
376
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
377
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
378
+ only_cross_attention (`bool`, *optional*):
379
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
380
+ double_self_attention (`bool`, *optional*):
381
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
382
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
383
+ num_embeds_ada_norm (:
384
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
385
+ attention_bias (:
386
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
387
+ """
388
+
389
+ def __init__(
390
+ self,
391
+ dim: int,
392
+ num_attention_heads: int,
393
+ attention_head_dim: int,
394
+ dropout=0.0,
395
+ cross_attention_dim: Optional[int] = None,
396
+ activation_fn: str = "geglu",
397
+ num_embeds_ada_norm: Optional[int] = None,
398
+ attention_bias: bool = False,
399
+ only_cross_attention: bool = False,
400
+ double_self_attention: bool = False,
401
+ upcast_attention: bool = False,
402
+ norm_elementwise_affine: bool = True,
403
+ norm_type: str = "layer_norm",
404
+ final_dropout: bool = False,
405
+ num_views: int = 1,
406
+ joint_attention: bool = False,
407
+ joint_attention_twice: bool = False,
408
+ multiview_attention: bool = True,
409
+ cross_domain_attention: bool = False
410
+ ):
411
+ super().__init__()
412
+ self.only_cross_attention = only_cross_attention
413
+
414
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
415
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
416
+
417
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
418
+ raise ValueError(
419
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
420
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
421
+ )
422
+
423
+ # Define 3 blocks. Each block has its own normalization layer.
424
+ # 1. Self-Attn
425
+ if self.use_ada_layer_norm:
426
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
427
+ elif self.use_ada_layer_norm_zero:
428
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
429
+ else:
430
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
431
+
432
+ self.multiview_attention = multiview_attention
433
+ self.cross_domain_attention = cross_domain_attention
434
+
435
+ self.attn1 = CustomAttention(
436
+ query_dim=dim,
437
+ heads=num_attention_heads,
438
+ dim_head=attention_head_dim,
439
+ dropout=dropout,
440
+ bias=attention_bias,
441
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
442
+ upcast_attention=upcast_attention,
443
+ processor=MVAttnProcessor()
444
+ )
445
+
446
+ # 2. Cross-Attn
447
+ if cross_attention_dim is not None or double_self_attention:
448
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
449
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
450
+ # the second cross attention block.
451
+ self.norm2 = (
452
+ AdaLayerNorm(dim, num_embeds_ada_norm)
453
+ if self.use_ada_layer_norm
454
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
455
+ )
456
+ self.attn2 = Attention(
457
+ query_dim=dim,
458
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
459
+ heads=num_attention_heads,
460
+ dim_head=attention_head_dim,
461
+ dropout=dropout,
462
+ bias=attention_bias,
463
+ upcast_attention=upcast_attention,
464
+ ) # is self-attn if encoder_hidden_states is none
465
+ else:
466
+ self.norm2 = None
467
+ self.attn2 = None
468
+
469
+ # 3. Feed-forward
470
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
471
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
472
+
473
+ # let chunk size default to None
474
+ self._chunk_size = None
475
+ self._chunk_dim = 0
476
+
477
+ self.num_views = num_views
478
+
479
+ self.joint_attention = joint_attention
480
+
481
+ if self.joint_attention:
482
+ # Joint task -Attn
483
+ self.attn_joint = CustomJointAttention(
484
+ query_dim=dim,
485
+ heads=num_attention_heads,
486
+ dim_head=attention_head_dim,
487
+ dropout=dropout,
488
+ bias=attention_bias,
489
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
490
+ upcast_attention=upcast_attention,
491
+ processor=JointAttnProcessor()
492
+ )
493
+ nn.init.zeros_(self.attn_joint.to_out[0].weight.data)
494
+ self.norm_joint = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
495
+
496
+
497
+ self.joint_attention_twice = joint_attention_twice
498
+
499
+ if self.joint_attention_twice:
500
+ print("joint twice")
501
+ # Joint task -Attn
502
+ self.attn_joint_twice = CustomJointAttention(
503
+ query_dim=dim,
504
+ heads=num_attention_heads,
505
+ dim_head=attention_head_dim,
506
+ dropout=dropout,
507
+ bias=attention_bias,
508
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
509
+ upcast_attention=upcast_attention,
510
+ processor=JointAttnProcessor()
511
+ )
512
+ nn.init.zeros_(self.attn_joint_twice.to_out[0].weight.data)
513
+ self.norm_joint_twice = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
514
+
515
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
516
+ # Sets chunk feed-forward
517
+ self._chunk_size = chunk_size
518
+ self._chunk_dim = dim
519
+
520
+ def forward(
521
+ self,
522
+ hidden_states: torch.FloatTensor,
523
+ attention_mask: Optional[torch.FloatTensor] = None,
524
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
525
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
526
+ timestep: Optional[torch.LongTensor] = None,
527
+ cross_attention_kwargs: Dict[str, Any] = None,
528
+ class_labels: Optional[torch.LongTensor] = None,
529
+ ):
530
+ assert attention_mask is None # not supported yet
531
+ # Notice that normalization is always applied before the real computation in the following blocks.
532
+ # 1. Self-Attention
533
+ if self.use_ada_layer_norm:
534
+ norm_hidden_states = self.norm1(hidden_states, timestep)
535
+ elif self.use_ada_layer_norm_zero:
536
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
537
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
538
+ )
539
+ else:
540
+ norm_hidden_states = self.norm1(hidden_states)
541
+
542
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
543
+
544
+ attn_output = self.attn1(
545
+ norm_hidden_states,
546
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
547
+ attention_mask=attention_mask,
548
+ num_views=self.num_views,
549
+ multiview_attention=self.multiview_attention,
550
+ cross_domain_attention=self.cross_domain_attention,
551
+ **cross_attention_kwargs,
552
+ )
553
+
554
+
555
+ if self.use_ada_layer_norm_zero:
556
+ attn_output = gate_msa.unsqueeze(1) * attn_output
557
+ hidden_states = attn_output + hidden_states
558
+
559
+ # joint attention twice
560
+ if self.joint_attention_twice:
561
+ norm_hidden_states = (
562
+ self.norm_joint_twice(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_joint_twice(hidden_states)
563
+ )
564
+ hidden_states = self.attn_joint_twice(norm_hidden_states) + hidden_states
565
+
566
+ # 2. Cross-Attention
567
+ if self.attn2 is not None:
568
+ norm_hidden_states = (
569
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
570
+ )
571
+
572
+ attn_output = self.attn2(
573
+ norm_hidden_states,
574
+ encoder_hidden_states=encoder_hidden_states,
575
+ attention_mask=encoder_attention_mask,
576
+ **cross_attention_kwargs,
577
+ )
578
+ hidden_states = attn_output + hidden_states
579
+
580
+ # 3. Feed-forward
581
+ norm_hidden_states = self.norm3(hidden_states)
582
+
583
+ if self.use_ada_layer_norm_zero:
584
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
585
+
586
+ if self._chunk_size is not None:
587
+ # "feed_forward_chunk_size" can be used to save memory
588
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
589
+ raise ValueError(
590
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
591
+ )
592
+
593
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
594
+ ff_output = torch.cat(
595
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
596
+ dim=self._chunk_dim,
597
+ )
598
+ else:
599
+ ff_output = self.ff(norm_hidden_states)
600
+
601
+ if self.use_ada_layer_norm_zero:
602
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
603
+
604
+ hidden_states = ff_output + hidden_states
605
+
606
+ if self.joint_attention:
607
+ norm_hidden_states = (
608
+ self.norm_joint(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_joint(hidden_states)
609
+ )
610
+ hidden_states = self.attn_joint(norm_hidden_states) + hidden_states
611
+
612
+ return hidden_states
613
+
614
+
615
+ class CustomAttention(Attention):
616
+ def set_use_memory_efficient_attention_xformers(
617
+ self, use_memory_efficient_attention_xformers: bool, *args, **kwargs
618
+ ):
619
+ processor = XFormersMVAttnProcessor()
620
+ self.set_processor(processor)
621
+ # print("using xformers attention processor")
622
+
623
+
624
+ class CustomJointAttention(Attention):
625
+ def set_use_memory_efficient_attention_xformers(
626
+ self, use_memory_efficient_attention_xformers: bool, *args, **kwargs
627
+ ):
628
+ processor = XFormersJointAttnProcessor()
629
+ self.set_processor(processor)
630
+ # print("using xformers attention processor")
631
+
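CustomAttention and CustomJointAttention only change how xformers is enabled: whichever flag is passed, they install the multi-view (or joint) xformers processor, which is what enable_xformers_memory_efficient_attention() triggers model-wide. A tiny sketch of that swap (no GPU is touched until an actual forward pass):

attn = CustomAttention(query_dim=320, heads=8, dim_head=40, processor=MVAttnProcessor())
attn.set_use_memory_efficient_attention_xformers(True)
print(type(attn.processor).__name__)    # XFormersMVAttnProcessor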
632
+ class MVAttnProcessor:
633
+ r"""
634
+ Default processor for performing attention-related computations.
635
+ """
636
+
637
+ def __call__(
638
+ self,
639
+ attn: Attention,
640
+ hidden_states,
641
+ encoder_hidden_states=None,
642
+ attention_mask=None,
643
+ temb=None,
644
+ num_views=1,
645
+ multiview_attention=True
646
+ ):
647
+ residual = hidden_states
648
+
649
+ if attn.spatial_norm is not None:
650
+ hidden_states = attn.spatial_norm(hidden_states, temb)
651
+
652
+ input_ndim = hidden_states.ndim
653
+
654
+ if input_ndim == 4:
655
+ batch_size, channel, height, width = hidden_states.shape
656
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
657
+
658
+ batch_size, sequence_length, _ = (
659
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
660
+ )
661
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
662
+
663
+ if attn.group_norm is not None:
664
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
665
+
666
+ query = attn.to_q(hidden_states)
667
+
668
+ if encoder_hidden_states is None:
669
+ encoder_hidden_states = hidden_states
670
+ elif attn.norm_cross:
671
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
672
+
673
+ key = attn.to_k(encoder_hidden_states)
674
+ value = attn.to_v(encoder_hidden_states)
675
+
676
+ # print('query', query.shape, 'key', key.shape, 'value', value.shape)
677
+ #([bx4, 1024, 320]) key torch.Size([bx4, 1024, 320]) value torch.Size([bx4, 1024, 320])
678
+ # pdb.set_trace()
679
+ # multi-view self-attention
680
+ if multiview_attention:
681
+ if num_views <= 6:
682
+ # with xformers enabled, full multi-view attention is feasible even with 6 views
683
+ key = rearrange(key, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
684
+ value = rearrange(value, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
685
+ else:  # sparse attention (currently a no-op; see the commented-out sampling below)
686
+ pass
687
+ # print("use sparse attention")
688
+ # # seems that the sparse random sampling cause problems
689
+ # # don't use random sampling, just fix the indexes
690
+ # onekey = rearrange(key, "(b t) d c -> b t d c", t=num_views)
691
+ # onevalue = rearrange(value, "(b t) d c -> b t d c", t=num_views)
692
+ # allkeys = []
693
+ # allvalues = []
694
+ # all_indexes = {
695
+ # 0 : [0, 2, 3, 4],
696
+ # 1: [0, 1, 3, 5],
697
+ # 2: [0, 2, 3, 4],
698
+ # 3: [0, 2, 3, 4],
699
+ # 4: [0, 2, 3, 4],
700
+ # 5: [0, 1, 3, 5]
701
+ # }
702
+ # for jj in range(num_views):
703
+ # # valid_index = [x for x in range(0, num_views) if x!= jj]
704
+ # # indexes = random.sample(valid_index, 3) + [jj] + [0]
705
+ # indexes = all_indexes[jj]
706
+
707
+ # indexes = torch.tensor(indexes).long().to(key.device)
708
+ # allkeys.append(onekey[:, indexes])
709
+ # allvalues.append(onevalue[:, indexes])
710
+ # keys = torch.stack(allkeys, dim=1) # checked, should be dim=1
711
+ # values = torch.stack(allvalues, dim=1)
712
+ # key = rearrange(keys, 'b t f d c -> (b t) (f d) c')
713
+ # value = rearrange(values, 'b t f d c -> (b t) (f d) c')
714
+
715
+
716
+ query = attn.head_to_batch_dim(query).contiguous()
717
+ key = attn.head_to_batch_dim(key).contiguous()
718
+ value = attn.head_to_batch_dim(value).contiguous()
719
+
720
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
721
+ hidden_states = torch.bmm(attention_probs, value)
722
+ hidden_states = attn.batch_to_head_dim(hidden_states)
723
+
724
+ # linear proj
725
+ hidden_states = attn.to_out[0](hidden_states)
726
+ # dropout
727
+ hidden_states = attn.to_out[1](hidden_states)
728
+
729
+ if input_ndim == 4:
730
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
731
+
732
+ if attn.residual_connection:
733
+ hidden_states = hidden_states + residual
734
+
735
+ hidden_states = hidden_states / attn.rescale_output_factor
736
+
737
+ return hidden_states
738
+
739
+
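The multi-view trick in MVAttnProcessor is pure tensor bookkeeping: per-view keys/values of shape ((B*Nv), D, C) are concatenated along the token axis and then repeated, so every view's queries attend over the tokens of all Nv views of the same object. A standalone shape sketch (not part of the commit):

import torch
from einops import rearrange

B, Nv, D, C = 2, 4, 1024, 320
key = torch.randn(B * Nv, D, C)                             # per-view keys, views stacked in the batch dim

key_mv = rearrange(key, "(b t) d c -> b (t d) c", t=Nv)     # (B, Nv*D, C): all views of one object
key_mv = key_mv.repeat_interleave(Nv, dim=0)                # (B*Nv, Nv*D, C): one copy per query view
print(key_mv.shape)                                         # torch.Size([8, 4096, 320])
assert torch.equal(key_mv[0], key_mv[1])                    # views 0 and 1 of object 0 see the same tokens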
740
+ class XFormersMVAttnProcessor:
741
+ r"""
742
+ Default processor for performing attention-related computations.
743
+ """
744
+
745
+ def __call__(
746
+ self,
747
+ attn: Attention,
748
+ hidden_states,
749
+ encoder_hidden_states=None,
750
+ attention_mask=None,
751
+ temb=None,
752
+ num_views=1,
753
+ multiview_attention=True,
754
+ cross_domain_attention=False,
755
+ ):
756
+ residual = hidden_states
757
+
758
+ if attn.spatial_norm is not None:
759
+ hidden_states = attn.spatial_norm(hidden_states, temb)
760
+
761
+ input_ndim = hidden_states.ndim
762
+
763
+ if input_ndim == 4:
764
+ batch_size, channel, height, width = hidden_states.shape
765
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
766
+
767
+ batch_size, sequence_length, _ = (
768
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
769
+ )
770
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
771
+
772
+ # from yuancheng; here attention_mask is None
773
+ if attention_mask is not None:
774
+ # expand our mask's singleton query_tokens dimension:
775
+ # [batch*heads, 1, key_tokens] ->
776
+ # [batch*heads, query_tokens, key_tokens]
777
+ # so that it can be added as a bias onto the attention scores that xformers computes:
778
+ # [batch*heads, query_tokens, key_tokens]
779
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
780
+ _, query_tokens, _ = hidden_states.shape
781
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
782
+
783
+ if attn.group_norm is not None:
784
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
785
+
786
+ query = attn.to_q(hidden_states)
787
+
788
+ if encoder_hidden_states is None:
789
+ encoder_hidden_states = hidden_states
790
+ elif attn.norm_cross:
791
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
792
+
793
+ key_raw = attn.to_k(encoder_hidden_states)
794
+ value_raw = attn.to_v(encoder_hidden_states)
795
+
796
+ # print('query', query.shape, 'key', key.shape, 'value', value.shape)
797
+ #([bx4, 1024, 320]) key torch.Size([bx4, 1024, 320]) value torch.Size([bx4, 1024, 320])
798
+ # pdb.set_trace()
799
+ # multi-view self-attention
800
+ if multiview_attention:
801
+ key = rearrange(key_raw, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
802
+ value = rearrange(value_raw, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
803
+
804
+ if cross_domain_attention:
805
+ # memory efficient, cross domain attention
806
+ key_0, key_1 = torch.chunk(key_raw, dim=0, chunks=2) # keys shape (b t) d c
807
+ value_0, value_1 = torch.chunk(value_raw, dim=0, chunks=2)
808
+ key_cross = torch.concat([key_1, key_0], dim=0)
809
+ value_cross = torch.concat([value_1, value_0], dim=0) # shape (b t) d c
810
+ key = torch.cat([key, key_cross], dim=1)
811
+ value = torch.cat([value, value_cross], dim=1) # shape (b t) (t+1 d) c
812
+ else:
813
+ # print("don't use multiview attention.")
814
+ key = key_raw
815
+ value = value_raw
816
+
817
+ query = attn.head_to_batch_dim(query)
818
+ key = attn.head_to_batch_dim(key)
819
+ value = attn.head_to_batch_dim(value)
820
+
821
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
822
+ hidden_states = attn.batch_to_head_dim(hidden_states)
823
+
824
+ # linear proj
825
+ hidden_states = attn.to_out[0](hidden_states)
826
+ # dropout
827
+ hidden_states = attn.to_out[1](hidden_states)
828
+
829
+ if input_ndim == 4:
830
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
831
+
832
+ if attn.residual_connection:
833
+ hidden_states = hidden_states + residual
834
+
835
+ hidden_states = hidden_states / attn.rescale_output_factor
836
+
837
+ return hidden_states
838
+
839
+
840
+
841
+ class XFormersJointAttnProcessor:
842
+ r"""
843
+ Default processor for performing attention-related computations.
844
+ """
845
+
846
+ def __call__(
847
+ self,
848
+ attn: Attention,
849
+ hidden_states,
850
+ encoder_hidden_states=None,
851
+ attention_mask=None,
852
+ temb=None,
853
+ num_tasks=2
854
+ ):
855
+
856
+ residual = hidden_states
857
+
858
+ if attn.spatial_norm is not None:
859
+ hidden_states = attn.spatial_norm(hidden_states, temb)
860
+
861
+ input_ndim = hidden_states.ndim
862
+
863
+ if input_ndim == 4:
864
+ batch_size, channel, height, width = hidden_states.shape
865
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
866
+
867
+ batch_size, sequence_length, _ = (
868
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
869
+ )
870
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
871
+
872
+ # from yuancheng; here attention_mask is None
873
+ if attention_mask is not None:
874
+ # expand our mask's singleton query_tokens dimension:
875
+ # [batch*heads, 1, key_tokens] ->
876
+ # [batch*heads, query_tokens, key_tokens]
877
+ # so that it can be added as a bias onto the attention scores that xformers computes:
878
+ # [batch*heads, query_tokens, key_tokens]
879
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
880
+ _, query_tokens, _ = hidden_states.shape
881
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
882
+
883
+ if attn.group_norm is not None:
884
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
885
+
886
+ query = attn.to_q(hidden_states)
887
+
888
+ if encoder_hidden_states is None:
889
+ encoder_hidden_states = hidden_states
890
+ elif attn.norm_cross:
891
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
892
+
893
+ key = attn.to_k(encoder_hidden_states)
894
+ value = attn.to_v(encoder_hidden_states)
895
+
896
+ assert num_tasks == 2 # only support two tasks now
897
+
898
+ key_0, key_1 = torch.chunk(key, dim=0, chunks=2) # keys shape (b t) d c
899
+ value_0, value_1 = torch.chunk(value, dim=0, chunks=2)
900
+ key = torch.cat([key_0, key_1], dim=1) # (b t) 2d c
901
+ value = torch.cat([value_0, value_1], dim=1) # (b t) 2d c
902
+ key = torch.cat([key]*2, dim=0) # ( 2 b t) 2d c
903
+ value = torch.cat([value]*2, dim=0) # (2 b t) 2d c
904
+
905
+
906
+ query = attn.head_to_batch_dim(query).contiguous()
907
+ key = attn.head_to_batch_dim(key).contiguous()
908
+ value = attn.head_to_batch_dim(value).contiguous()
909
+
910
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
911
+ hidden_states = attn.batch_to_head_dim(hidden_states)
912
+
913
+ # linear proj
914
+ hidden_states = attn.to_out[0](hidden_states)
915
+ # dropout
916
+ hidden_states = attn.to_out[1](hidden_states)
917
+
918
+ if input_ndim == 4:
919
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
920
+
921
+ if attn.residual_connection:
922
+ hidden_states = hidden_states + residual
923
+
924
+ hidden_states = hidden_states / attn.rescale_output_factor
925
+
926
+ return hidden_states
927
+
928
+
929
+ class JointAttnProcessor:
930
+ r"""
931
+ Default processor for performing attention-related computations.
932
+ """
933
+
934
+ def __call__(
935
+ self,
936
+ attn: Attention,
937
+ hidden_states,
938
+ encoder_hidden_states=None,
939
+ attention_mask=None,
940
+ temb=None,
941
+ num_tasks=2
942
+ ):
943
+
944
+ residual = hidden_states
945
+
946
+ if attn.spatial_norm is not None:
947
+ hidden_states = attn.spatial_norm(hidden_states, temb)
948
+
949
+ input_ndim = hidden_states.ndim
950
+
951
+ if input_ndim == 4:
952
+ batch_size, channel, height, width = hidden_states.shape
953
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
954
+
955
+ batch_size, sequence_length, _ = (
956
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
957
+ )
958
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
959
+
960
+
961
+ if attn.group_norm is not None:
962
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
963
+
964
+ query = attn.to_q(hidden_states)
965
+
966
+ if encoder_hidden_states is None:
967
+ encoder_hidden_states = hidden_states
968
+ elif attn.norm_cross:
969
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
970
+
971
+ key = attn.to_k(encoder_hidden_states)
972
+ value = attn.to_v(encoder_hidden_states)
973
+
974
+ assert num_tasks == 2 # only support two tasks now
975
+
976
+ key_0, key_1 = torch.chunk(key, dim=0, chunks=2) # keys shape (b t) d c
977
+ value_0, value_1 = torch.chunk(value, dim=0, chunks=2)
978
+ key = torch.cat([key_0, key_1], dim=1) # (b t) 2d c
979
+ value = torch.cat([value_0, value_1], dim=1) # (b t) 2d c
980
+ key = torch.cat([key]*2, dim=0) # ( 2 b t) 2d c
981
+ value = torch.cat([value]*2, dim=0) # (2 b t) 2d c
982
+
983
+
984
+ query = attn.head_to_batch_dim(query).contiguous()
985
+ key = attn.head_to_batch_dim(key).contiguous()
986
+ value = attn.head_to_batch_dim(value).contiguous()
987
+
988
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
989
+ hidden_states = torch.bmm(attention_probs, value)
990
+ hidden_states = attn.batch_to_head_dim(hidden_states)
991
+
992
+ # linear proj
993
+ hidden_states = attn.to_out[0](hidden_states)
994
+ # dropout
995
+ hidden_states = attn.to_out[1](hidden_states)
996
+
997
+ if input_ndim == 4:
998
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
999
+
1000
+ if attn.residual_connection:
1001
+ hidden_states = hidden_states + residual
1002
+
1003
+ hidden_states = hidden_states / attn.rescale_output_factor
1004
+
1005
+ return hidden_states
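The joint processors assume the batch stacks exactly two tasks (normals, then colors) along dim 0; keys/values of both halves are concatenated along the token axis and shared by both halves, so the normal branch and the color branch attend into each other. A shape-only sketch of that bookkeeping (not part of the commit):

import torch

BT, D, C = 8, 1024, 320                 # (B * Nv) per task; dim 0 holds [normal half, color half]
key = torch.randn(2 * BT, D, C)

key_normal, key_color = torch.chunk(key, chunks=2, dim=0)   # (BT, D, C) each
joint = torch.cat([key_normal, key_color], dim=1)           # (BT, 2D, C): tokens of both tasks
joint = torch.cat([joint] * 2, dim=0)                       # (2*BT, 2D, C): reused by both halves
print(joint.shape)                                          # torch.Size([16, 2048, 320])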
mvdiffusion/models/unet_mv2d_blocks.py ADDED
@@ -0,0 +1,880 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional, Tuple
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ from diffusers.utils import is_torch_version, logging
22
+ from diffusers.models.attention import AdaGroupNorm
23
+ from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
24
+ from diffusers.models.dual_transformer_2d import DualTransformer2DModel
25
+ from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
26
+ from mvdiffusion.models.transformer_mv2d import TransformerMV2DModel
27
+
28
+ from diffusers.models.unet_2d_blocks import DownBlock2D, ResnetDownsampleBlock2D, AttnDownBlock2D, CrossAttnDownBlock2D, SimpleCrossAttnDownBlock2D, SkipDownBlock2D, AttnSkipDownBlock2D, DownEncoderBlock2D, AttnDownEncoderBlock2D, KDownBlock2D, KCrossAttnDownBlock2D
29
+ from diffusers.models.unet_2d_blocks import UpBlock2D, ResnetUpsampleBlock2D, CrossAttnUpBlock2D, SimpleCrossAttnUpBlock2D, AttnUpBlock2D, SkipUpBlock2D, AttnSkipUpBlock2D, UpDecoderBlock2D, AttnUpDecoderBlock2D, KUpBlock2D, KCrossAttnUpBlock2D
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+
35
+ def get_down_block(
36
+ down_block_type,
37
+ num_layers,
38
+ in_channels,
39
+ out_channels,
40
+ temb_channels,
41
+ add_downsample,
42
+ resnet_eps,
43
+ resnet_act_fn,
44
+ transformer_layers_per_block=1,
45
+ num_attention_heads=None,
46
+ resnet_groups=None,
47
+ cross_attention_dim=None,
48
+ downsample_padding=None,
49
+ dual_cross_attention=False,
50
+ use_linear_projection=False,
51
+ only_cross_attention=False,
52
+ upcast_attention=False,
53
+ resnet_time_scale_shift="default",
54
+ resnet_skip_time_act=False,
55
+ resnet_out_scale_factor=1.0,
56
+ cross_attention_norm=None,
57
+ attention_head_dim=None,
58
+ downsample_type=None,
59
+ num_views=1
60
+ ):
61
+ # If attn head dim is not defined, we default it to the number of heads
62
+ if attention_head_dim is None:
63
+ logger.warn(
64
+ f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
65
+ )
66
+ attention_head_dim = num_attention_heads
67
+
68
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
69
+ if down_block_type == "DownBlock2D":
70
+ return DownBlock2D(
71
+ num_layers=num_layers,
72
+ in_channels=in_channels,
73
+ out_channels=out_channels,
74
+ temb_channels=temb_channels,
75
+ add_downsample=add_downsample,
76
+ resnet_eps=resnet_eps,
77
+ resnet_act_fn=resnet_act_fn,
78
+ resnet_groups=resnet_groups,
79
+ downsample_padding=downsample_padding,
80
+ resnet_time_scale_shift=resnet_time_scale_shift,
81
+ )
82
+ elif down_block_type == "ResnetDownsampleBlock2D":
83
+ return ResnetDownsampleBlock2D(
84
+ num_layers=num_layers,
85
+ in_channels=in_channels,
86
+ out_channels=out_channels,
87
+ temb_channels=temb_channels,
88
+ add_downsample=add_downsample,
89
+ resnet_eps=resnet_eps,
90
+ resnet_act_fn=resnet_act_fn,
91
+ resnet_groups=resnet_groups,
92
+ resnet_time_scale_shift=resnet_time_scale_shift,
93
+ skip_time_act=resnet_skip_time_act,
94
+ output_scale_factor=resnet_out_scale_factor,
95
+ )
96
+ elif down_block_type == "AttnDownBlock2D":
97
+ if add_downsample is False:
98
+ downsample_type = None
99
+ else:
100
+ downsample_type = downsample_type or "conv" # default to 'conv'
101
+ return AttnDownBlock2D(
102
+ num_layers=num_layers,
103
+ in_channels=in_channels,
104
+ out_channels=out_channels,
105
+ temb_channels=temb_channels,
106
+ resnet_eps=resnet_eps,
107
+ resnet_act_fn=resnet_act_fn,
108
+ resnet_groups=resnet_groups,
109
+ downsample_padding=downsample_padding,
110
+ attention_head_dim=attention_head_dim,
111
+ resnet_time_scale_shift=resnet_time_scale_shift,
112
+ downsample_type=downsample_type,
113
+ )
114
+ elif down_block_type == "CrossAttnDownBlock2D":
115
+ if cross_attention_dim is None:
116
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
117
+ return CrossAttnDownBlock2D(
118
+ num_layers=num_layers,
119
+ transformer_layers_per_block=transformer_layers_per_block,
120
+ in_channels=in_channels,
121
+ out_channels=out_channels,
122
+ temb_channels=temb_channels,
123
+ add_downsample=add_downsample,
124
+ resnet_eps=resnet_eps,
125
+ resnet_act_fn=resnet_act_fn,
126
+ resnet_groups=resnet_groups,
127
+ downsample_padding=downsample_padding,
128
+ cross_attention_dim=cross_attention_dim,
129
+ num_attention_heads=num_attention_heads,
130
+ dual_cross_attention=dual_cross_attention,
131
+ use_linear_projection=use_linear_projection,
132
+ only_cross_attention=only_cross_attention,
133
+ upcast_attention=upcast_attention,
134
+ resnet_time_scale_shift=resnet_time_scale_shift,
135
+ )
136
+ # custom MV2D attention block
137
+ elif down_block_type == "CrossAttnDownBlockMV2D":
138
+ if cross_attention_dim is None:
139
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMV2D")
140
+ return CrossAttnDownBlockMV2D(
141
+ num_layers=num_layers,
142
+ transformer_layers_per_block=transformer_layers_per_block,
143
+ in_channels=in_channels,
144
+ out_channels=out_channels,
145
+ temb_channels=temb_channels,
146
+ add_downsample=add_downsample,
147
+ resnet_eps=resnet_eps,
148
+ resnet_act_fn=resnet_act_fn,
149
+ resnet_groups=resnet_groups,
150
+ downsample_padding=downsample_padding,
151
+ cross_attention_dim=cross_attention_dim,
152
+ num_attention_heads=num_attention_heads,
153
+ dual_cross_attention=dual_cross_attention,
154
+ use_linear_projection=use_linear_projection,
155
+ only_cross_attention=only_cross_attention,
156
+ upcast_attention=upcast_attention,
157
+ resnet_time_scale_shift=resnet_time_scale_shift,
158
+ num_views=num_views
159
+ )
160
+ elif down_block_type == "SimpleCrossAttnDownBlock2D":
161
+ if cross_attention_dim is None:
162
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
163
+ return SimpleCrossAttnDownBlock2D(
164
+ num_layers=num_layers,
165
+ in_channels=in_channels,
166
+ out_channels=out_channels,
167
+ temb_channels=temb_channels,
168
+ add_downsample=add_downsample,
169
+ resnet_eps=resnet_eps,
170
+ resnet_act_fn=resnet_act_fn,
171
+ resnet_groups=resnet_groups,
172
+ cross_attention_dim=cross_attention_dim,
173
+ attention_head_dim=attention_head_dim,
174
+ resnet_time_scale_shift=resnet_time_scale_shift,
175
+ skip_time_act=resnet_skip_time_act,
176
+ output_scale_factor=resnet_out_scale_factor,
177
+ only_cross_attention=only_cross_attention,
178
+ cross_attention_norm=cross_attention_norm,
179
+ )
180
+ elif down_block_type == "SkipDownBlock2D":
181
+ return SkipDownBlock2D(
182
+ num_layers=num_layers,
183
+ in_channels=in_channels,
184
+ out_channels=out_channels,
185
+ temb_channels=temb_channels,
186
+ add_downsample=add_downsample,
187
+ resnet_eps=resnet_eps,
188
+ resnet_act_fn=resnet_act_fn,
189
+ downsample_padding=downsample_padding,
190
+ resnet_time_scale_shift=resnet_time_scale_shift,
191
+ )
192
+ elif down_block_type == "AttnSkipDownBlock2D":
193
+ return AttnSkipDownBlock2D(
194
+ num_layers=num_layers,
195
+ in_channels=in_channels,
196
+ out_channels=out_channels,
197
+ temb_channels=temb_channels,
198
+ add_downsample=add_downsample,
199
+ resnet_eps=resnet_eps,
200
+ resnet_act_fn=resnet_act_fn,
201
+ attention_head_dim=attention_head_dim,
202
+ resnet_time_scale_shift=resnet_time_scale_shift,
203
+ )
204
+ elif down_block_type == "DownEncoderBlock2D":
205
+ return DownEncoderBlock2D(
206
+ num_layers=num_layers,
207
+ in_channels=in_channels,
208
+ out_channels=out_channels,
209
+ add_downsample=add_downsample,
210
+ resnet_eps=resnet_eps,
211
+ resnet_act_fn=resnet_act_fn,
212
+ resnet_groups=resnet_groups,
213
+ downsample_padding=downsample_padding,
214
+ resnet_time_scale_shift=resnet_time_scale_shift,
215
+ )
216
+ elif down_block_type == "AttnDownEncoderBlock2D":
217
+ return AttnDownEncoderBlock2D(
218
+ num_layers=num_layers,
219
+ in_channels=in_channels,
220
+ out_channels=out_channels,
221
+ add_downsample=add_downsample,
222
+ resnet_eps=resnet_eps,
223
+ resnet_act_fn=resnet_act_fn,
224
+ resnet_groups=resnet_groups,
225
+ downsample_padding=downsample_padding,
226
+ attention_head_dim=attention_head_dim,
227
+ resnet_time_scale_shift=resnet_time_scale_shift,
228
+ )
229
+ elif down_block_type == "KDownBlock2D":
230
+ return KDownBlock2D(
231
+ num_layers=num_layers,
232
+ in_channels=in_channels,
233
+ out_channels=out_channels,
234
+ temb_channels=temb_channels,
235
+ add_downsample=add_downsample,
236
+ resnet_eps=resnet_eps,
237
+ resnet_act_fn=resnet_act_fn,
238
+ )
239
+ elif down_block_type == "KCrossAttnDownBlock2D":
240
+ return KCrossAttnDownBlock2D(
241
+ num_layers=num_layers,
242
+ in_channels=in_channels,
243
+ out_channels=out_channels,
244
+ temb_channels=temb_channels,
245
+ add_downsample=add_downsample,
246
+ resnet_eps=resnet_eps,
247
+ resnet_act_fn=resnet_act_fn,
248
+ cross_attention_dim=cross_attention_dim,
249
+ attention_head_dim=attention_head_dim,
250
+ add_self_attention=True if not add_downsample else False,
251
+ )
252
+ raise ValueError(f"{down_block_type} does not exist.")
253
+
254
+
255
+ def get_up_block(
256
+ up_block_type,
257
+ num_layers,
258
+ in_channels,
259
+ out_channels,
260
+ prev_output_channel,
261
+ temb_channels,
262
+ add_upsample,
263
+ resnet_eps,
264
+ resnet_act_fn,
265
+ transformer_layers_per_block=1,
266
+ num_attention_heads=None,
267
+ resnet_groups=None,
268
+ cross_attention_dim=None,
269
+ dual_cross_attention=False,
270
+ use_linear_projection=False,
271
+ only_cross_attention=False,
272
+ upcast_attention=False,
273
+ resnet_time_scale_shift="default",
274
+ resnet_skip_time_act=False,
275
+ resnet_out_scale_factor=1.0,
276
+ cross_attention_norm=None,
277
+ attention_head_dim=None,
278
+ upsample_type=None,
279
+ num_views=1
280
+ ):
281
+ # If attn head dim is not defined, we default it to the number of heads
282
+ if attention_head_dim is None:
283
+ logger.warn(
284
+ f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
285
+ )
286
+ attention_head_dim = num_attention_heads
287
+
288
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
289
+ if up_block_type == "UpBlock2D":
290
+ return UpBlock2D(
291
+ num_layers=num_layers,
292
+ in_channels=in_channels,
293
+ out_channels=out_channels,
294
+ prev_output_channel=prev_output_channel,
295
+ temb_channels=temb_channels,
296
+ add_upsample=add_upsample,
297
+ resnet_eps=resnet_eps,
298
+ resnet_act_fn=resnet_act_fn,
299
+ resnet_groups=resnet_groups,
300
+ resnet_time_scale_shift=resnet_time_scale_shift,
301
+ )
302
+ elif up_block_type == "ResnetUpsampleBlock2D":
303
+ return ResnetUpsampleBlock2D(
304
+ num_layers=num_layers,
305
+ in_channels=in_channels,
306
+ out_channels=out_channels,
307
+ prev_output_channel=prev_output_channel,
308
+ temb_channels=temb_channels,
309
+ add_upsample=add_upsample,
310
+ resnet_eps=resnet_eps,
311
+ resnet_act_fn=resnet_act_fn,
312
+ resnet_groups=resnet_groups,
313
+ resnet_time_scale_shift=resnet_time_scale_shift,
314
+ skip_time_act=resnet_skip_time_act,
315
+ output_scale_factor=resnet_out_scale_factor,
316
+ )
317
+ elif up_block_type == "CrossAttnUpBlock2D":
318
+ if cross_attention_dim is None:
319
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
320
+ return CrossAttnUpBlock2D(
321
+ num_layers=num_layers,
322
+ transformer_layers_per_block=transformer_layers_per_block,
323
+ in_channels=in_channels,
324
+ out_channels=out_channels,
325
+ prev_output_channel=prev_output_channel,
326
+ temb_channels=temb_channels,
327
+ add_upsample=add_upsample,
328
+ resnet_eps=resnet_eps,
329
+ resnet_act_fn=resnet_act_fn,
330
+ resnet_groups=resnet_groups,
331
+ cross_attention_dim=cross_attention_dim,
332
+ num_attention_heads=num_attention_heads,
333
+ dual_cross_attention=dual_cross_attention,
334
+ use_linear_projection=use_linear_projection,
335
+ only_cross_attention=only_cross_attention,
336
+ upcast_attention=upcast_attention,
337
+ resnet_time_scale_shift=resnet_time_scale_shift,
338
+ )
339
+ # custom MV2D attention block
340
+ elif up_block_type == "CrossAttnUpBlockMV2D":
341
+ if cross_attention_dim is None:
342
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMV2D")
343
+ return CrossAttnUpBlockMV2D(
344
+ num_layers=num_layers,
345
+ transformer_layers_per_block=transformer_layers_per_block,
346
+ in_channels=in_channels,
347
+ out_channels=out_channels,
348
+ prev_output_channel=prev_output_channel,
349
+ temb_channels=temb_channels,
350
+ add_upsample=add_upsample,
351
+ resnet_eps=resnet_eps,
352
+ resnet_act_fn=resnet_act_fn,
353
+ resnet_groups=resnet_groups,
354
+ cross_attention_dim=cross_attention_dim,
355
+ num_attention_heads=num_attention_heads,
356
+ dual_cross_attention=dual_cross_attention,
357
+ use_linear_projection=use_linear_projection,
358
+ only_cross_attention=only_cross_attention,
359
+ upcast_attention=upcast_attention,
360
+ resnet_time_scale_shift=resnet_time_scale_shift,
361
+ num_views=num_views
362
+ )
363
+ elif up_block_type == "SimpleCrossAttnUpBlock2D":
364
+ if cross_attention_dim is None:
365
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
366
+ return SimpleCrossAttnUpBlock2D(
367
+ num_layers=num_layers,
368
+ in_channels=in_channels,
369
+ out_channels=out_channels,
370
+ prev_output_channel=prev_output_channel,
371
+ temb_channels=temb_channels,
372
+ add_upsample=add_upsample,
373
+ resnet_eps=resnet_eps,
374
+ resnet_act_fn=resnet_act_fn,
375
+ resnet_groups=resnet_groups,
376
+ cross_attention_dim=cross_attention_dim,
377
+ attention_head_dim=attention_head_dim,
378
+ resnet_time_scale_shift=resnet_time_scale_shift,
379
+ skip_time_act=resnet_skip_time_act,
380
+ output_scale_factor=resnet_out_scale_factor,
381
+ only_cross_attention=only_cross_attention,
382
+ cross_attention_norm=cross_attention_norm,
383
+ )
384
+ elif up_block_type == "AttnUpBlock2D":
385
+ if add_upsample is False:
386
+ upsample_type = None
387
+ else:
388
+ upsample_type = upsample_type or "conv" # default to 'conv'
389
+
390
+ return AttnUpBlock2D(
391
+ num_layers=num_layers,
392
+ in_channels=in_channels,
393
+ out_channels=out_channels,
394
+ prev_output_channel=prev_output_channel,
395
+ temb_channels=temb_channels,
396
+ resnet_eps=resnet_eps,
397
+ resnet_act_fn=resnet_act_fn,
398
+ resnet_groups=resnet_groups,
399
+ attention_head_dim=attention_head_dim,
400
+ resnet_time_scale_shift=resnet_time_scale_shift,
401
+ upsample_type=upsample_type,
402
+ )
403
+ elif up_block_type == "SkipUpBlock2D":
404
+ return SkipUpBlock2D(
405
+ num_layers=num_layers,
406
+ in_channels=in_channels,
407
+ out_channels=out_channels,
408
+ prev_output_channel=prev_output_channel,
409
+ temb_channels=temb_channels,
410
+ add_upsample=add_upsample,
411
+ resnet_eps=resnet_eps,
412
+ resnet_act_fn=resnet_act_fn,
413
+ resnet_time_scale_shift=resnet_time_scale_shift,
414
+ )
415
+ elif up_block_type == "AttnSkipUpBlock2D":
416
+ return AttnSkipUpBlock2D(
417
+ num_layers=num_layers,
418
+ in_channels=in_channels,
419
+ out_channels=out_channels,
420
+ prev_output_channel=prev_output_channel,
421
+ temb_channels=temb_channels,
422
+ add_upsample=add_upsample,
423
+ resnet_eps=resnet_eps,
424
+ resnet_act_fn=resnet_act_fn,
425
+ attention_head_dim=attention_head_dim,
426
+ resnet_time_scale_shift=resnet_time_scale_shift,
427
+ )
428
+ elif up_block_type == "UpDecoderBlock2D":
429
+ return UpDecoderBlock2D(
430
+ num_layers=num_layers,
431
+ in_channels=in_channels,
432
+ out_channels=out_channels,
433
+ add_upsample=add_upsample,
434
+ resnet_eps=resnet_eps,
435
+ resnet_act_fn=resnet_act_fn,
436
+ resnet_groups=resnet_groups,
437
+ resnet_time_scale_shift=resnet_time_scale_shift,
438
+ temb_channels=temb_channels,
439
+ )
440
+ elif up_block_type == "AttnUpDecoderBlock2D":
441
+ return AttnUpDecoderBlock2D(
442
+ num_layers=num_layers,
443
+ in_channels=in_channels,
444
+ out_channels=out_channels,
445
+ add_upsample=add_upsample,
446
+ resnet_eps=resnet_eps,
447
+ resnet_act_fn=resnet_act_fn,
448
+ resnet_groups=resnet_groups,
449
+ attention_head_dim=attention_head_dim,
450
+ resnet_time_scale_shift=resnet_time_scale_shift,
451
+ temb_channels=temb_channels,
452
+ )
453
+ elif up_block_type == "KUpBlock2D":
454
+ return KUpBlock2D(
455
+ num_layers=num_layers,
456
+ in_channels=in_channels,
457
+ out_channels=out_channels,
458
+ temb_channels=temb_channels,
459
+ add_upsample=add_upsample,
460
+ resnet_eps=resnet_eps,
461
+ resnet_act_fn=resnet_act_fn,
462
+ )
463
+ elif up_block_type == "KCrossAttnUpBlock2D":
464
+ return KCrossAttnUpBlock2D(
465
+ num_layers=num_layers,
466
+ in_channels=in_channels,
467
+ out_channels=out_channels,
468
+ temb_channels=temb_channels,
469
+ add_upsample=add_upsample,
470
+ resnet_eps=resnet_eps,
471
+ resnet_act_fn=resnet_act_fn,
472
+ cross_attention_dim=cross_attention_dim,
473
+ attention_head_dim=attention_head_dim,
474
+ )
475
+
476
+ raise ValueError(f"{up_block_type} does not exist.")
477
+
478
+
479
+ class UNetMidBlockMV2DCrossAttn(nn.Module):
480
+ def __init__(
481
+ self,
482
+ in_channels: int,
483
+ temb_channels: int,
484
+ dropout: float = 0.0,
485
+ num_layers: int = 1,
486
+ transformer_layers_per_block: int = 1,
487
+ resnet_eps: float = 1e-6,
488
+ resnet_time_scale_shift: str = "default",
489
+ resnet_act_fn: str = "swish",
490
+ resnet_groups: int = 32,
491
+ resnet_pre_norm: bool = True,
492
+ num_attention_heads=1,
493
+ output_scale_factor=1.0,
494
+ cross_attention_dim=1280,
495
+ dual_cross_attention=False,
496
+ use_linear_projection=False,
497
+ upcast_attention=False,
498
+ num_views: int = 1,
499
+ joint_attention: bool = False,
500
+ joint_attention_twice: bool = False,
501
+ multiview_attention: bool = True,
502
+ cross_domain_attention: bool=False
503
+ ):
504
+ super().__init__()
505
+
506
+ self.has_cross_attention = True
507
+ self.num_attention_heads = num_attention_heads
508
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
509
+
510
+ # there is always at least one resnet
511
+ resnets = [
512
+ ResnetBlock2D(
513
+ in_channels=in_channels,
514
+ out_channels=in_channels,
515
+ temb_channels=temb_channels,
516
+ eps=resnet_eps,
517
+ groups=resnet_groups,
518
+ dropout=dropout,
519
+ time_embedding_norm=resnet_time_scale_shift,
520
+ non_linearity=resnet_act_fn,
521
+ output_scale_factor=output_scale_factor,
522
+ pre_norm=resnet_pre_norm,
523
+ )
524
+ ]
525
+ attentions = []
526
+
527
+ for _ in range(num_layers):
528
+ if not dual_cross_attention:
529
+ attentions.append(
530
+ TransformerMV2DModel(
531
+ num_attention_heads,
532
+ in_channels // num_attention_heads,
533
+ in_channels=in_channels,
534
+ num_layers=transformer_layers_per_block,
535
+ cross_attention_dim=cross_attention_dim,
536
+ norm_num_groups=resnet_groups,
537
+ use_linear_projection=use_linear_projection,
538
+ upcast_attention=upcast_attention,
539
+ num_views=num_views,
540
+ joint_attention=joint_attention,
541
+ joint_attention_twice=joint_attention_twice,
542
+ multiview_attention=multiview_attention,
543
+ cross_domain_attention=cross_domain_attention
544
+ )
545
+ )
546
+ else:
547
+ raise NotImplementedError
548
+ resnets.append(
549
+ ResnetBlock2D(
550
+ in_channels=in_channels,
551
+ out_channels=in_channels,
552
+ temb_channels=temb_channels,
553
+ eps=resnet_eps,
554
+ groups=resnet_groups,
555
+ dropout=dropout,
556
+ time_embedding_norm=resnet_time_scale_shift,
557
+ non_linearity=resnet_act_fn,
558
+ output_scale_factor=output_scale_factor,
559
+ pre_norm=resnet_pre_norm,
560
+ )
561
+ )
562
+
563
+ self.attentions = nn.ModuleList(attentions)
564
+ self.resnets = nn.ModuleList(resnets)
565
+
566
+ def forward(
567
+ self,
568
+ hidden_states: torch.FloatTensor,
569
+ temb: Optional[torch.FloatTensor] = None,
570
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
571
+ attention_mask: Optional[torch.FloatTensor] = None,
572
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
573
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
574
+ ) -> torch.FloatTensor:
575
+ hidden_states = self.resnets[0](hidden_states, temb)
576
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
577
+ hidden_states = attn(
578
+ hidden_states,
579
+ encoder_hidden_states=encoder_hidden_states,
580
+ cross_attention_kwargs=cross_attention_kwargs,
581
+ attention_mask=attention_mask,
582
+ encoder_attention_mask=encoder_attention_mask,
583
+ return_dict=False,
584
+ )[0]
585
+ hidden_states = resnet(hidden_states, temb)
586
+
587
+ return hidden_states
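The mid block applies its leading ResNet once and then alternates a multi-view transformer with a ResNet, so resnets always holds one module more than attentions. A stripped-down sketch of just that interleaving, with identity stand-ins for the real ResnetBlock2D/TransformerMV2DModel modules (names and shapes here are invented):

import torch
from torch import nn

class ToyMidBlock(nn.Module):
    def __init__(self, num_layers=1):
        super().__init__()
        # One more resnet than attention layers, exactly as in the block above.
        self.resnets = nn.ModuleList([nn.Identity() for _ in range(num_layers + 1)])
        self.attentions = nn.ModuleList([nn.Identity() for _ in range(num_layers)])

    def forward(self, x):
        x = self.resnets[0](x)                    # leading resnet
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            x = resnet(attn(x))                   # attention, then its paired resnet
        return x

print(ToyMidBlock(num_layers=2)(torch.randn(1, 4, 8, 8)).shape)  # torch.Size([1, 4, 8, 8])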
588
+
589
+
590
+ class CrossAttnUpBlockMV2D(nn.Module):
591
+ def __init__(
592
+ self,
593
+ in_channels: int,
594
+ out_channels: int,
595
+ prev_output_channel: int,
596
+ temb_channels: int,
597
+ dropout: float = 0.0,
598
+ num_layers: int = 1,
599
+ transformer_layers_per_block: int = 1,
600
+ resnet_eps: float = 1e-6,
601
+ resnet_time_scale_shift: str = "default",
602
+ resnet_act_fn: str = "swish",
603
+ resnet_groups: int = 32,
604
+ resnet_pre_norm: bool = True,
605
+ num_attention_heads=1,
606
+ cross_attention_dim=1280,
607
+ output_scale_factor=1.0,
608
+ add_upsample=True,
609
+ dual_cross_attention=False,
610
+ use_linear_projection=False,
611
+ only_cross_attention=False,
612
+ upcast_attention=False,
613
+ num_views: int = 1
614
+ ):
615
+ super().__init__()
616
+ resnets = []
617
+ attentions = []
618
+
619
+ self.has_cross_attention = True
620
+ self.num_attention_heads = num_attention_heads
621
+
622
+ for i in range(num_layers):
623
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
624
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
625
+
626
+ resnets.append(
627
+ ResnetBlock2D(
628
+ in_channels=resnet_in_channels + res_skip_channels,
629
+ out_channels=out_channels,
630
+ temb_channels=temb_channels,
631
+ eps=resnet_eps,
632
+ groups=resnet_groups,
633
+ dropout=dropout,
634
+ time_embedding_norm=resnet_time_scale_shift,
635
+ non_linearity=resnet_act_fn,
636
+ output_scale_factor=output_scale_factor,
637
+ pre_norm=resnet_pre_norm,
638
+ )
639
+ )
640
+ if not dual_cross_attention:
641
+ attentions.append(
642
+ TransformerMV2DModel(
643
+ num_attention_heads,
644
+ out_channels // num_attention_heads,
645
+ in_channels=out_channels,
646
+ num_layers=transformer_layers_per_block,
647
+ cross_attention_dim=cross_attention_dim,
648
+ norm_num_groups=resnet_groups,
649
+ use_linear_projection=use_linear_projection,
650
+ only_cross_attention=only_cross_attention,
651
+ upcast_attention=upcast_attention,
652
+ num_views=num_views
653
+ )
654
+ )
655
+ else:
656
+ raise NotImplementedError
657
+ self.attentions = nn.ModuleList(attentions)
658
+ self.resnets = nn.ModuleList(resnets)
659
+
660
+ if add_upsample:
661
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
662
+ else:
663
+ self.upsamplers = None
664
+
665
+ self.gradient_checkpointing = False
666
+
667
+ def forward(
668
+ self,
669
+ hidden_states: torch.FloatTensor,
670
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
671
+ temb: Optional[torch.FloatTensor] = None,
672
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
673
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
674
+ upsample_size: Optional[int] = None,
675
+ attention_mask: Optional[torch.FloatTensor] = None,
676
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
677
+ ):
678
+ for resnet, attn in zip(self.resnets, self.attentions):
679
+ # pop res hidden states
680
+ res_hidden_states = res_hidden_states_tuple[-1]
681
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
682
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
683
+
684
+ if self.training and self.gradient_checkpointing:
685
+
686
+ def create_custom_forward(module, return_dict=None):
687
+ def custom_forward(*inputs):
688
+ if return_dict is not None:
689
+ return module(*inputs, return_dict=return_dict)
690
+ else:
691
+ return module(*inputs)
692
+
693
+ return custom_forward
694
+
695
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
696
+ hidden_states = torch.utils.checkpoint.checkpoint(
697
+ create_custom_forward(resnet),
698
+ hidden_states,
699
+ temb,
700
+ **ckpt_kwargs,
701
+ )
702
+ hidden_states = torch.utils.checkpoint.checkpoint(
703
+ create_custom_forward(attn, return_dict=False),
704
+ hidden_states,
705
+ encoder_hidden_states,
706
+ None, # timestep
707
+ None, # class_labels
708
+ cross_attention_kwargs,
709
+ attention_mask,
710
+ encoder_attention_mask,
711
+ **ckpt_kwargs,
712
+ )[0]
713
+ else:
714
+ hidden_states = resnet(hidden_states, temb)
715
+ hidden_states = attn(
716
+ hidden_states,
717
+ encoder_hidden_states=encoder_hidden_states,
718
+ cross_attention_kwargs=cross_attention_kwargs,
719
+ attention_mask=attention_mask,
720
+ encoder_attention_mask=encoder_attention_mask,
721
+ return_dict=False,
722
+ )[0]
723
+
724
+ if self.upsamplers is not None:
725
+ for upsampler in self.upsamplers:
726
+ hidden_states = upsampler(hidden_states, upsample_size)
727
+
728
+ return hidden_states
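The up block consumes encoder skip connections last-in-first-out: each iteration pops the newest entry of res_hidden_states_tuple and concatenates it with the current activations along the channel axis before the ResNet. A minimal tensor-level sketch of that pattern, with invented shapes and a 1x1 convolution standing in for ResnetBlock2D:

import torch
from torch import nn

out_channels = 640
hidden = torch.randn(2, out_channels, 16, 16)            # current decoder activations
skips = (torch.randn(2, 320, 16, 16),                     # oldest encoder skip
         torch.randn(2, 640, 16, 16))                     # newest encoder skip

for _ in range(2):
    res = skips[-1]                                       # pop the newest skip first
    skips = skips[:-1]
    x = torch.cat([hidden, res], dim=1)                   # channel-wise concat
    hidden = nn.Conv2d(x.shape[1], out_channels, 1)(x)    # stand-in for ResnetBlock2D

print(hidden.shape)  # torch.Size([2, 640, 16, 16])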
729
+
730
+
731
+ class CrossAttnDownBlockMV2D(nn.Module):
732
+ def __init__(
733
+ self,
734
+ in_channels: int,
735
+ out_channels: int,
736
+ temb_channels: int,
737
+ dropout: float = 0.0,
738
+ num_layers: int = 1,
739
+ transformer_layers_per_block: int = 1,
740
+ resnet_eps: float = 1e-6,
741
+ resnet_time_scale_shift: str = "default",
742
+ resnet_act_fn: str = "swish",
743
+ resnet_groups: int = 32,
744
+ resnet_pre_norm: bool = True,
745
+ num_attention_heads=1,
746
+ cross_attention_dim=1280,
747
+ output_scale_factor=1.0,
748
+ downsample_padding=1,
749
+ add_downsample=True,
750
+ dual_cross_attention=False,
751
+ use_linear_projection=False,
752
+ only_cross_attention=False,
753
+ upcast_attention=False,
754
+ num_views: int = 1
755
+ ):
756
+ super().__init__()
757
+ resnets = []
758
+ attentions = []
759
+
760
+ self.has_cross_attention = True
761
+ self.num_attention_heads = num_attention_heads
762
+
763
+ for i in range(num_layers):
764
+ in_channels = in_channels if i == 0 else out_channels
765
+ resnets.append(
766
+ ResnetBlock2D(
767
+ in_channels=in_channels,
768
+ out_channels=out_channels,
769
+ temb_channels=temb_channels,
770
+ eps=resnet_eps,
771
+ groups=resnet_groups,
772
+ dropout=dropout,
773
+ time_embedding_norm=resnet_time_scale_shift,
774
+ non_linearity=resnet_act_fn,
775
+ output_scale_factor=output_scale_factor,
776
+ pre_norm=resnet_pre_norm,
777
+ )
778
+ )
779
+ if not dual_cross_attention:
780
+ attentions.append(
781
+ TransformerMV2DModel(
782
+ num_attention_heads,
783
+ out_channels // num_attention_heads,
784
+ in_channels=out_channels,
785
+ num_layers=transformer_layers_per_block,
786
+ cross_attention_dim=cross_attention_dim,
787
+ norm_num_groups=resnet_groups,
788
+ use_linear_projection=use_linear_projection,
789
+ only_cross_attention=only_cross_attention,
790
+ upcast_attention=upcast_attention,
791
+ num_views=num_views
792
+ )
793
+ )
794
+ else:
795
+ raise NotImplementedError
796
+ self.attentions = nn.ModuleList(attentions)
797
+ self.resnets = nn.ModuleList(resnets)
798
+
799
+ if add_downsample:
800
+ self.downsamplers = nn.ModuleList(
801
+ [
802
+ Downsample2D(
803
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
804
+ )
805
+ ]
806
+ )
807
+ else:
808
+ self.downsamplers = None
809
+
810
+ self.gradient_checkpointing = False
811
+
812
+ def forward(
813
+ self,
814
+ hidden_states: torch.FloatTensor,
815
+ temb: Optional[torch.FloatTensor] = None,
816
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
817
+ attention_mask: Optional[torch.FloatTensor] = None,
818
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
819
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
820
+ additional_residuals=None,
821
+ ):
822
+ output_states = ()
823
+
824
+ blocks = list(zip(self.resnets, self.attentions))
825
+
826
+ for i, (resnet, attn) in enumerate(blocks):
827
+ if self.training and self.gradient_checkpointing:
828
+
829
+ def create_custom_forward(module, return_dict=None):
830
+ def custom_forward(*inputs):
831
+ if return_dict is not None:
832
+ return module(*inputs, return_dict=return_dict)
833
+ else:
834
+ return module(*inputs)
835
+
836
+ return custom_forward
837
+
838
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
839
+ hidden_states = torch.utils.checkpoint.checkpoint(
840
+ create_custom_forward(resnet),
841
+ hidden_states,
842
+ temb,
843
+ **ckpt_kwargs,
844
+ )
845
+ hidden_states = torch.utils.checkpoint.checkpoint(
846
+ create_custom_forward(attn, return_dict=False),
847
+ hidden_states,
848
+ encoder_hidden_states,
849
+ None, # timestep
850
+ None, # class_labels
851
+ cross_attention_kwargs,
852
+ attention_mask,
853
+ encoder_attention_mask,
854
+ **ckpt_kwargs,
855
+ )[0]
856
+ else:
857
+ hidden_states = resnet(hidden_states, temb)
858
+ hidden_states = attn(
859
+ hidden_states,
860
+ encoder_hidden_states=encoder_hidden_states,
861
+ cross_attention_kwargs=cross_attention_kwargs,
862
+ attention_mask=attention_mask,
863
+ encoder_attention_mask=encoder_attention_mask,
864
+ return_dict=False,
865
+ )[0]
866
+
867
+ # apply additional residuals to the output of the last pair of resnet and attention blocks
868
+ if i == len(blocks) - 1 and additional_residuals is not None:
869
+ hidden_states = hidden_states + additional_residuals
870
+
871
+ output_states = output_states + (hidden_states,)
872
+
873
+ if self.downsamplers is not None:
874
+ for downsampler in self.downsamplers:
875
+ hidden_states = downsampler(hidden_states)
876
+
877
+ output_states = output_states + (hidden_states,)
878
+
879
+ return hidden_states, output_states
880
+
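Both MV2D blocks above wrap their ResNet and transformer calls in torch.utils.checkpoint.checkpoint when gradient_checkpointing is enabled during training, using the small create_custom_forward closure so keyword arguments such as return_dict can still be routed through a positional-only checkpoint call. A self-contained sketch of that wrapper pattern, with a plain linear layer standing in for the real sub-modules:

import torch
import torch.utils.checkpoint
from torch import nn

def create_custom_forward(module, return_dict=None):
    # Same closure trick as in the blocks: checkpoint() only forwards positional args.
    def custom_forward(*inputs):
        if return_dict is not None:
            return module(*inputs, return_dict=return_dict)
        return module(*inputs)
    return custom_forward

layer = nn.Linear(8, 8)
x = torch.randn(4, 8, requires_grad=True)
# use_reentrant=False is what the blocks select on torch >= 1.11.
y = torch.utils.checkpoint.checkpoint(create_custom_forward(layer), x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 8])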
mvdiffusion/models/unet_mv2d_condition.py ADDED
@@ -0,0 +1,1462 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+ import os
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.utils.checkpoint
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.loaders import UNet2DConditionLoadersMixin
24
+ from diffusers.utils import BaseOutput, logging
25
+ from diffusers.models.activations import get_activation
26
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
27
+ from diffusers.models.embeddings import (
28
+ GaussianFourierProjection,
29
+ ImageHintTimeEmbedding,
30
+ ImageProjection,
31
+ ImageTimeEmbedding,
32
+ TextImageProjection,
33
+ TextImageTimeEmbedding,
34
+ TextTimeEmbedding,
35
+ TimestepEmbedding,
36
+ Timesteps,
37
+ )
38
+ from diffusers.models.modeling_utils import ModelMixin, load_state_dict, _load_state_dict_into_model
39
+ from diffusers.models.unet_2d_blocks import (
40
+ CrossAttnDownBlock2D,
41
+ CrossAttnUpBlock2D,
42
+ DownBlock2D,
43
+ UNetMidBlock2DCrossAttn,
44
+ UNetMidBlock2DSimpleCrossAttn,
45
+ UpBlock2D,
46
+ )
47
+ from diffusers.utils import (
48
+ CONFIG_NAME,
49
+ DIFFUSERS_CACHE,
50
+ FLAX_WEIGHTS_NAME,
51
+ HF_HUB_OFFLINE,
52
+ SAFETENSORS_WEIGHTS_NAME,
53
+ WEIGHTS_NAME,
54
+ _add_variant,
55
+ _get_model_file,
56
+ deprecate,
57
+ is_accelerate_available,
58
+ is_safetensors_available,
59
+ is_torch_version,
60
+ logging,
61
+ )
62
+ from diffusers import __version__
63
+ from mvdiffusion.models.unet_mv2d_blocks import (
64
+ CrossAttnDownBlockMV2D,
65
+ CrossAttnUpBlockMV2D,
66
+ UNetMidBlockMV2DCrossAttn,
67
+ get_down_block,
68
+ get_up_block,
69
+ )
70
+
71
+
72
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
73
+
74
+
75
+ @dataclass
76
+ class UNetMV2DConditionOutput(BaseOutput):
77
+ """
78
+ The output of [`UNetMV2DConditionModel`].
79
+
80
+ Args:
81
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
82
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
83
+ """
84
+
85
+ sample: torch.FloatTensor = None
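UNetMV2DConditionOutput is a standard diffusers BaseOutput dataclass, so the prediction can be read either by attribute or by index. A small sketch of those two access styles with a look-alike class (the shape and the ToyUNetOutput name are made up, and this assumes a diffusers installation):

import torch
from dataclasses import dataclass
from diffusers.utils import BaseOutput

@dataclass
class ToyUNetOutput(BaseOutput):
    # Mirrors UNetMV2DConditionOutput: a single `sample` field.
    sample: torch.FloatTensor = None

out = ToyUNetOutput(sample=torch.randn(12, 4, 32, 32))  # e.g. batch = B * num_views
print(out.sample.shape)  # attribute access
print(out[0].shape)      # index access (BaseOutput also behaves like a tuple)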
86
+
87
+
88
+ class UNetMV2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
89
+ r"""
90
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
91
+ shaped output.
92
+
93
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
94
+ for all models (such as downloading or saving).
95
+
96
+ Parameters:
97
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
98
+ Height and width of input/output sample.
99
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
100
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
101
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
102
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
103
+ Whether to flip the sin to cos in the time embedding.
104
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
105
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
106
+ The tuple of downsample blocks to use.
107
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
108
+ Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
109
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
110
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
111
+ The tuple of upsample blocks to use.
112
+ only_cross_attention (`bool` or `Tuple[bool]`, *optional*, defaults to `False`):
113
+ Whether to include self-attention in the basic transformer blocks, see
114
+ [`~models.attention.BasicTransformerBlock`].
115
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
116
+ The tuple of output channels for each block.
117
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
118
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
119
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
120
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
121
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
122
+ If `None`, normalization and activation layers are skipped in post-processing.
123
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
124
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
125
+ The dimension of the cross attention features.
126
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
127
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
128
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
129
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
130
+ encoder_hid_dim (`int`, *optional*, defaults to None):
131
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
132
+ dimension to `cross_attention_dim`.
133
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
134
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
135
+ embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
136
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
137
+ num_attention_heads (`int`, *optional*):
138
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
139
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
140
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
141
+ class_embed_type (`str`, *optional*, defaults to `None`):
142
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
143
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
144
+ addition_embed_type (`str`, *optional*, defaults to `None`):
145
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
146
+ "text". "text" will use the `TextTimeEmbedding` layer.
147
+ addition_time_embed_dim (`int`, *optional*, defaults to `None`):
148
+ Dimension for the timestep embeddings.
149
+ num_class_embeds (`int`, *optional*, defaults to `None`):
150
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
151
+ class conditioning with `class_embed_type` equal to `None`.
152
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
153
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
154
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
155
+ An optional override for the dimension of the projected time embedding.
156
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
157
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
158
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
159
+ timestep_post_act (`str`, *optional*, defaults to `None`):
160
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
161
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
162
+ The dimension of `cond_proj` layer in the timestep embedding.
163
+ conv_in_kernel (`int`, *optional*, defaults to `3`): The kernel size of `conv_in` layer.
164
+ conv_out_kernel (`int`, *optional*, defaults to `3`): The kernel size of `conv_out` layer.
165
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
166
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
167
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
168
+ embeddings with the class embeddings.
169
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
170
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
171
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
172
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
173
+ otherwise.
174
+ """
175
+
176
+ _supports_gradient_checkpointing = True
177
+
178
+ @register_to_config
179
+ def __init__(
180
+ self,
181
+ sample_size: Optional[int] = None,
182
+ in_channels: int = 4,
183
+ out_channels: int = 4,
184
+ center_input_sample: bool = False,
185
+ flip_sin_to_cos: bool = True,
186
+ freq_shift: int = 0,
187
+ down_block_types: Tuple[str] = (
188
+ "CrossAttnDownBlockMV2D",
189
+ "CrossAttnDownBlockMV2D",
190
+ "CrossAttnDownBlockMV2D",
191
+ "DownBlock2D",
192
+ ),
193
+ mid_block_type: Optional[str] = "UNetMidBlockMV2DCrossAttn",
194
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlockMV2D", "CrossAttnUpBlockMV2D", "CrossAttnUpBlockMV2D"),
195
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
196
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
197
+ layers_per_block: Union[int, Tuple[int]] = 2,
198
+ downsample_padding: int = 1,
199
+ mid_block_scale_factor: float = 1,
200
+ act_fn: str = "silu",
201
+ norm_num_groups: Optional[int] = 32,
202
+ norm_eps: float = 1e-5,
203
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
204
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
205
+ encoder_hid_dim: Optional[int] = None,
206
+ encoder_hid_dim_type: Optional[str] = None,
207
+ attention_head_dim: Union[int, Tuple[int]] = 8,
208
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
209
+ dual_cross_attention: bool = False,
210
+ use_linear_projection: bool = False,
211
+ class_embed_type: Optional[str] = None,
212
+ addition_embed_type: Optional[str] = None,
213
+ addition_time_embed_dim: Optional[int] = None,
214
+ num_class_embeds: Optional[int] = None,
215
+ upcast_attention: bool = False,
216
+ resnet_time_scale_shift: str = "default",
217
+ resnet_skip_time_act: bool = False,
218
+ resnet_out_scale_factor: int = 1.0,
219
+ time_embedding_type: str = "positional",
220
+ time_embedding_dim: Optional[int] = None,
221
+ time_embedding_act_fn: Optional[str] = None,
222
+ timestep_post_act: Optional[str] = None,
223
+ time_cond_proj_dim: Optional[int] = None,
224
+ conv_in_kernel: int = 3,
225
+ conv_out_kernel: int = 3,
226
+ projection_class_embeddings_input_dim: Optional[int] = None,
227
+ class_embeddings_concat: bool = False,
228
+ mid_block_only_cross_attention: Optional[bool] = None,
229
+ cross_attention_norm: Optional[str] = None,
230
+ addition_embed_type_num_heads=64,
231
+ num_views: int = 1,
232
+ joint_attention: bool = False,
233
+ joint_attention_twice: bool = False,
234
+ multiview_attention: bool = True,
235
+ cross_domain_attention: bool = False
236
+ ):
237
+ super().__init__()
238
+
239
+ self.sample_size = sample_size
240
+
241
+ if num_attention_heads is not None:
242
+ raise ValueError(
243
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
244
+ )
245
+
246
+ # If `num_attention_heads` is not defined (which is the case for most models)
247
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
248
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
249
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
250
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
251
+ # which is why we correct for the naming here.
252
+ num_attention_heads = num_attention_heads or attention_head_dim
253
+
254
+ # Check inputs
255
+ if len(down_block_types) != len(up_block_types):
256
+ raise ValueError(
257
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
258
+ )
259
+
260
+ if len(block_out_channels) != len(down_block_types):
261
+ raise ValueError(
262
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
263
+ )
264
+
265
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
266
+ raise ValueError(
267
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
268
+ )
269
+
270
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
271
+ raise ValueError(
272
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
273
+ )
274
+
275
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
276
+ raise ValueError(
277
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
278
+ )
279
+
280
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
281
+ raise ValueError(
282
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
283
+ )
284
+
285
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
286
+ raise ValueError(
287
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
288
+ )
289
+
290
+ # input
291
+ conv_in_padding = (conv_in_kernel - 1) // 2
292
+ self.conv_in = nn.Conv2d(
293
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
294
+ )
295
+
296
+ # time
297
+ if time_embedding_type == "fourier":
298
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
299
+ if time_embed_dim % 2 != 0:
300
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
301
+ self.time_proj = GaussianFourierProjection(
302
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
303
+ )
304
+ timestep_input_dim = time_embed_dim
305
+ elif time_embedding_type == "positional":
306
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
307
+
308
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
309
+ timestep_input_dim = block_out_channels[0]
310
+ else:
311
+ raise ValueError(
312
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
313
+ )
314
+
315
+ self.time_embedding = TimestepEmbedding(
316
+ timestep_input_dim,
317
+ time_embed_dim,
318
+ act_fn=act_fn,
319
+ post_act_fn=timestep_post_act,
320
+ cond_proj_dim=time_cond_proj_dim,
321
+ )
322
+
323
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
324
+ encoder_hid_dim_type = "text_proj"
325
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
326
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
327
+
328
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
329
+ raise ValueError(
330
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
331
+ )
332
+
333
+ if encoder_hid_dim_type == "text_proj":
334
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
335
+ elif encoder_hid_dim_type == "text_image_proj":
336
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
337
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
338
+ # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
339
+ self.encoder_hid_proj = TextImageProjection(
340
+ text_embed_dim=encoder_hid_dim,
341
+ image_embed_dim=cross_attention_dim,
342
+ cross_attention_dim=cross_attention_dim,
343
+ )
344
+ elif encoder_hid_dim_type == "image_proj":
345
+ # Kandinsky 2.2
346
+ self.encoder_hid_proj = ImageProjection(
347
+ image_embed_dim=encoder_hid_dim,
348
+ cross_attention_dim=cross_attention_dim,
349
+ )
350
+ elif encoder_hid_dim_type is not None:
351
+ raise ValueError(
352
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
353
+ )
354
+ else:
355
+ self.encoder_hid_proj = None
356
+
357
+ # class embedding
358
+ if class_embed_type is None and num_class_embeds is not None:
359
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
360
+ elif class_embed_type == "timestep":
361
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
362
+ elif class_embed_type == "identity":
363
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
364
+ elif class_embed_type == "projection":
365
+ if projection_class_embeddings_input_dim is None:
366
+ raise ValueError(
367
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
368
+ )
369
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
370
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
371
+ # 2. it projects from an arbitrary input dimension.
372
+ #
373
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
374
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
375
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
376
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
377
+ elif class_embed_type == "simple_projection":
378
+ if projection_class_embeddings_input_dim is None:
379
+ raise ValueError(
380
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
381
+ )
382
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
383
+ else:
384
+ self.class_embedding = None
385
+
386
+ if addition_embed_type == "text":
387
+ if encoder_hid_dim is not None:
388
+ text_time_embedding_from_dim = encoder_hid_dim
389
+ else:
390
+ text_time_embedding_from_dim = cross_attention_dim
391
+
392
+ self.add_embedding = TextTimeEmbedding(
393
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
394
+ )
395
+ elif addition_embed_type == "text_image":
396
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
397
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
398
+ # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
399
+ self.add_embedding = TextImageTimeEmbedding(
400
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
401
+ )
402
+ elif addition_embed_type == "text_time":
403
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
404
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
405
+ elif addition_embed_type == "image":
406
+ # Kandinsky 2.2
407
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
408
+ elif addition_embed_type == "image_hint":
409
+ # Kandinsky 2.2 ControlNet
410
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
411
+ elif addition_embed_type is not None:
412
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
413
+
414
+ if time_embedding_act_fn is None:
415
+ self.time_embed_act = None
416
+ else:
417
+ self.time_embed_act = get_activation(time_embedding_act_fn)
418
+
419
+ self.down_blocks = nn.ModuleList([])
420
+ self.up_blocks = nn.ModuleList([])
421
+
422
+ if isinstance(only_cross_attention, bool):
423
+ if mid_block_only_cross_attention is None:
424
+ mid_block_only_cross_attention = only_cross_attention
425
+
426
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
427
+
428
+ if mid_block_only_cross_attention is None:
429
+ mid_block_only_cross_attention = False
430
+
431
+ if isinstance(num_attention_heads, int):
432
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
433
+
434
+ if isinstance(attention_head_dim, int):
435
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
436
+
437
+ if isinstance(cross_attention_dim, int):
438
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
439
+
440
+ if isinstance(layers_per_block, int):
441
+ layers_per_block = [layers_per_block] * len(down_block_types)
442
+
443
+ if isinstance(transformer_layers_per_block, int):
444
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
445
+
446
+ if class_embeddings_concat:
447
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
448
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
449
+ # regular time embeddings
450
+ blocks_time_embed_dim = time_embed_dim * 2
451
+ else:
452
+ blocks_time_embed_dim = time_embed_dim
453
+
454
+ # down
455
+ output_channel = block_out_channels[0]
456
+ for i, down_block_type in enumerate(down_block_types):
457
+ input_channel = output_channel
458
+ output_channel = block_out_channels[i]
459
+ is_final_block = i == len(block_out_channels) - 1
460
+
461
+ down_block = get_down_block(
462
+ down_block_type,
463
+ num_layers=layers_per_block[i],
464
+ transformer_layers_per_block=transformer_layers_per_block[i],
465
+ in_channels=input_channel,
466
+ out_channels=output_channel,
467
+ temb_channels=blocks_time_embed_dim,
468
+ add_downsample=not is_final_block,
469
+ resnet_eps=norm_eps,
470
+ resnet_act_fn=act_fn,
471
+ resnet_groups=norm_num_groups,
472
+ cross_attention_dim=cross_attention_dim[i],
473
+ num_attention_heads=num_attention_heads[i],
474
+ downsample_padding=downsample_padding,
475
+ dual_cross_attention=dual_cross_attention,
476
+ use_linear_projection=use_linear_projection,
477
+ only_cross_attention=only_cross_attention[i],
478
+ upcast_attention=upcast_attention,
479
+ resnet_time_scale_shift=resnet_time_scale_shift,
480
+ resnet_skip_time_act=resnet_skip_time_act,
481
+ resnet_out_scale_factor=resnet_out_scale_factor,
482
+ cross_attention_norm=cross_attention_norm,
483
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
484
+ num_views=num_views
485
+ )
486
+ self.down_blocks.append(down_block)
487
+
488
+ # mid
489
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
490
+ self.mid_block = UNetMidBlock2DCrossAttn(
491
+ transformer_layers_per_block=transformer_layers_per_block[-1],
492
+ in_channels=block_out_channels[-1],
493
+ temb_channels=blocks_time_embed_dim,
494
+ resnet_eps=norm_eps,
495
+ resnet_act_fn=act_fn,
496
+ output_scale_factor=mid_block_scale_factor,
497
+ resnet_time_scale_shift=resnet_time_scale_shift,
498
+ cross_attention_dim=cross_attention_dim[-1],
499
+ num_attention_heads=num_attention_heads[-1],
500
+ resnet_groups=norm_num_groups,
501
+ dual_cross_attention=dual_cross_attention,
502
+ use_linear_projection=use_linear_projection,
503
+ upcast_attention=upcast_attention,
504
+ )
505
+ # custom MV2D attention block
506
+ elif mid_block_type == "UNetMidBlockMV2DCrossAttn":
507
+ self.mid_block = UNetMidBlockMV2DCrossAttn(
508
+ transformer_layers_per_block=transformer_layers_per_block[-1],
509
+ in_channels=block_out_channels[-1],
510
+ temb_channels=blocks_time_embed_dim,
511
+ resnet_eps=norm_eps,
512
+ resnet_act_fn=act_fn,
513
+ output_scale_factor=mid_block_scale_factor,
514
+ resnet_time_scale_shift=resnet_time_scale_shift,
515
+ cross_attention_dim=cross_attention_dim[-1],
516
+ num_attention_heads=num_attention_heads[-1],
517
+ resnet_groups=norm_num_groups,
518
+ dual_cross_attention=dual_cross_attention,
519
+ use_linear_projection=use_linear_projection,
520
+ upcast_attention=upcast_attention,
521
+ num_views=num_views,
522
+ joint_attention=joint_attention,
523
+ joint_attention_twice=joint_attention_twice,
524
+ multiview_attention=multiview_attention,
525
+ cross_domain_attention=cross_domain_attention
526
+ )
527
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
528
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
529
+ in_channels=block_out_channels[-1],
530
+ temb_channels=blocks_time_embed_dim,
531
+ resnet_eps=norm_eps,
532
+ resnet_act_fn=act_fn,
533
+ output_scale_factor=mid_block_scale_factor,
534
+ cross_attention_dim=cross_attention_dim[-1],
535
+ attention_head_dim=attention_head_dim[-1],
536
+ resnet_groups=norm_num_groups,
537
+ resnet_time_scale_shift=resnet_time_scale_shift,
538
+ skip_time_act=resnet_skip_time_act,
539
+ only_cross_attention=mid_block_only_cross_attention,
540
+ cross_attention_norm=cross_attention_norm,
541
+ )
542
+ elif mid_block_type is None:
543
+ self.mid_block = None
544
+ else:
545
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
546
+
547
+ # count how many layers upsample the images
548
+ self.num_upsamplers = 0
549
+
550
+ # up
551
+ reversed_block_out_channels = list(reversed(block_out_channels))
552
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
553
+ reversed_layers_per_block = list(reversed(layers_per_block))
554
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
555
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
556
+ only_cross_attention = list(reversed(only_cross_attention))
557
+
558
+ output_channel = reversed_block_out_channels[0]
559
+ for i, up_block_type in enumerate(up_block_types):
560
+ is_final_block = i == len(block_out_channels) - 1
561
+
562
+ prev_output_channel = output_channel
563
+ output_channel = reversed_block_out_channels[i]
564
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
565
+
566
+ # add upsample block for all BUT final layer
567
+ if not is_final_block:
568
+ add_upsample = True
569
+ self.num_upsamplers += 1
570
+ else:
571
+ add_upsample = False
572
+
573
+ up_block = get_up_block(
574
+ up_block_type,
575
+ num_layers=reversed_layers_per_block[i] + 1,
576
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
577
+ in_channels=input_channel,
578
+ out_channels=output_channel,
579
+ prev_output_channel=prev_output_channel,
580
+ temb_channels=blocks_time_embed_dim,
581
+ add_upsample=add_upsample,
582
+ resnet_eps=norm_eps,
583
+ resnet_act_fn=act_fn,
584
+ resnet_groups=norm_num_groups,
585
+ cross_attention_dim=reversed_cross_attention_dim[i],
586
+ num_attention_heads=reversed_num_attention_heads[i],
587
+ dual_cross_attention=dual_cross_attention,
588
+ use_linear_projection=use_linear_projection,
589
+ only_cross_attention=only_cross_attention[i],
590
+ upcast_attention=upcast_attention,
591
+ resnet_time_scale_shift=resnet_time_scale_shift,
592
+ resnet_skip_time_act=resnet_skip_time_act,
593
+ resnet_out_scale_factor=resnet_out_scale_factor,
594
+ cross_attention_norm=cross_attention_norm,
595
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
596
+ num_views=num_views
597
+ )
598
+ self.up_blocks.append(up_block)
599
+ prev_output_channel = output_channel
600
+
601
+ # out
602
+ if norm_num_groups is not None:
603
+ self.conv_norm_out = nn.GroupNorm(
604
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
605
+ )
606
+
607
+ self.conv_act = get_activation(act_fn)
608
+
609
+ else:
610
+ self.conv_norm_out = None
611
+ self.conv_act = None
612
+
613
+ conv_out_padding = (conv_out_kernel - 1) // 2
614
+ self.conv_out = nn.Conv2d(
615
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
616
+ )
617
+
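The constructor above broadcasts scalar settings (attention heads, layers per block, cross-attention dims) to one value per down block before building the blocks. A minimal standalone sketch of that pattern, with assumed values:

```python
# Illustrative only: mirrors the scalar-to-per-block broadcasting done in __init__ above.
down_block_types = ("CrossAttnDownBlockMV2D",) * 3 + ("DownBlock2D",)  # assumed 4 blocks
num_attention_heads = 8   # scalar from the config
layers_per_block = 2      # scalar from the config

if isinstance(num_attention_heads, int):
    num_attention_heads = (num_attention_heads,) * len(down_block_types)
if isinstance(layers_per_block, int):
    layers_per_block = [layers_per_block] * len(down_block_types)

print(num_attention_heads)  # (8, 8, 8, 8)
print(layers_per_block)     # [2, 2, 2, 2]
```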
618
+ @property
619
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
620
+ r"""
621
+ Returns:
622
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
623
+ indexed by its weight name.
624
+ """
625
+ # set recursively
626
+ processors = {}
627
+
628
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
629
+ if hasattr(module, "set_processor"):
630
+ processors[f"{name}.processor"] = module.processor
631
+
632
+ for sub_name, child in module.named_children():
633
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
634
+
635
+ return processors
636
+
637
+ for name, module in self.named_children():
638
+ fn_recursive_add_processors(name, module, processors)
639
+
640
+ return processors
641
+
642
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
643
+ r"""
644
+ Sets the attention processor to use to compute attention.
645
+
646
+ Parameters:
647
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
648
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
649
+ for **all** `Attention` layers.
650
+
651
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
652
+ processor. This is strongly recommended when setting trainable attention processors.
653
+
654
+ """
655
+ count = len(self.attn_processors.keys())
656
+
657
+ if isinstance(processor, dict) and len(processor) != count:
658
+ raise ValueError(
659
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
660
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
661
+ )
662
+
663
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
664
+ if hasattr(module, "set_processor"):
665
+ if not isinstance(processor, dict):
666
+ module.set_processor(processor)
667
+ else:
668
+ module.set_processor(processor.pop(f"{name}.processor"))
669
+
670
+ for sub_name, child in module.named_children():
671
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
672
+
673
+ for name, module in self.named_children():
674
+ fn_recursive_attn_processor(name, module, processor)
675
+
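A usage sketch for the two methods above, assuming `unet` is an instance of this model and using the default processor class from diffusers:

```python
# Illustrative only; `unet` is assumed to be an instance of this UNet class.
from diffusers.models.attention_processor import AttnProcessor

procs = unet.attn_processors                 # dict keyed by weight name
print(len(procs), sorted(procs)[:2])

unet.set_attn_processor(AttnProcessor())     # one processor for every attention layer
unet.set_attn_processor({name: AttnProcessor() for name in procs})  # or one per layer
```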
676
+ def set_default_attn_processor(self):
677
+ """
678
+ Disables custom attention processors and sets the default attention implementation.
679
+ """
680
+ self.set_attn_processor(AttnProcessor())
681
+
682
+ def set_attention_slice(self, slice_size):
683
+ r"""
684
+ Enable sliced attention computation.
685
+
686
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
687
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
688
+
689
+ Args:
690
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
691
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
692
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
693
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
694
+ must be a multiple of `slice_size`.
695
+ """
696
+ sliceable_head_dims = []
697
+
698
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
699
+ if hasattr(module, "set_attention_slice"):
700
+ sliceable_head_dims.append(module.sliceable_head_dim)
701
+
702
+ for child in module.children():
703
+ fn_recursive_retrieve_sliceable_dims(child)
704
+
705
+ # retrieve number of attention layers
706
+ for module in self.children():
707
+ fn_recursive_retrieve_sliceable_dims(module)
708
+
709
+ num_sliceable_layers = len(sliceable_head_dims)
710
+
711
+ if slice_size == "auto":
712
+ # half the attention head size is usually a good trade-off between
713
+ # speed and memory
714
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
715
+ elif slice_size == "max":
716
+ # make smallest slice possible
717
+ slice_size = num_sliceable_layers * [1]
718
+
719
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
720
+
721
+ if len(slice_size) != len(sliceable_head_dims):
722
+ raise ValueError(
723
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
724
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
725
+ )
726
+
727
+ for i in range(len(slice_size)):
728
+ size = slice_size[i]
729
+ dim = sliceable_head_dims[i]
730
+ if size is not None and size > dim:
731
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
732
+
733
+ # Recursively walk through all the children.
734
+ # Any children which exposes the set_attention_slice method
735
+ # gets the message
736
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
737
+ if hasattr(module, "set_attention_slice"):
738
+ module.set_attention_slice(slice_size.pop())
739
+
740
+ for child in module.children():
741
+ fn_recursive_set_attention_slice(child, slice_size)
742
+
743
+ reversed_slice_size = list(reversed(slice_size))
744
+ for module in self.children():
745
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
746
+
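A usage sketch for `set_attention_slice`, assuming `unet` is an instance of this model; the accepted values follow the docstring above:

```python
# Illustrative only; `unet` is assumed to be an instance of this UNet class.
unet.set_attention_slice("auto")  # slice size = half of each sliceable head dim
unet.set_attention_slice("max")   # slice size 1: lowest memory, slowest
unet.set_attention_slice(4)       # roughly attention_head_dim // 4 slices per layer
```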
747
+ def _set_gradient_checkpointing(self, module, value=False):
748
+ if isinstance(module, (CrossAttnDownBlock2D, CrossAttnDownBlockMV2D, DownBlock2D, CrossAttnUpBlock2D, CrossAttnUpBlockMV2D, UpBlock2D)):
749
+ module.gradient_checkpointing = value
750
+
751
+ def forward(
752
+ self,
753
+ sample: torch.FloatTensor,
754
+ timestep: Union[torch.Tensor, float, int],
755
+ encoder_hidden_states: torch.Tensor,
756
+ class_labels: Optional[torch.Tensor] = None,
757
+ timestep_cond: Optional[torch.Tensor] = None,
758
+ attention_mask: Optional[torch.Tensor] = None,
759
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
760
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
761
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
762
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
763
+ encoder_attention_mask: Optional[torch.Tensor] = None,
764
+ return_dict: bool = True,
765
+ ) -> Union[UNetMV2DConditionOutput, Tuple]:
766
+ r"""
767
+ The [`UNet2DConditionModel`] forward method.
768
+
769
+ Args:
770
+ sample (`torch.FloatTensor`):
771
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
772
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
773
+ encoder_hidden_states (`torch.FloatTensor`):
774
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
775
+ encoder_attention_mask (`torch.Tensor`):
776
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
777
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
778
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
779
+ return_dict (`bool`, *optional*, defaults to `True`):
780
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
781
+ tuple.
782
+ cross_attention_kwargs (`dict`, *optional*):
783
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
784
+ added_cond_kwargs: (`dict`, *optional*):
785
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
786
+ are passed along to the UNet blocks.
787
+
788
+ Returns:
789
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
790
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
791
+ a `tuple` is returned where the first element is the sample tensor.
792
+ """
793
+ # By default samples have to be at least a multiple of the overall upsampling factor.
794
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
795
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
796
+ # on the fly if necessary.
797
+ default_overall_up_factor = 2**self.num_upsamplers
798
+
799
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
800
+ forward_upsample_size = False
801
+ upsample_size = None
802
+
803
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
804
+ logger.info("Forward upsample size to force interpolation output size.")
805
+ forward_upsample_size = True
806
+
807
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
808
+ # expects mask of shape:
809
+ # [batch, key_tokens]
810
+ # adds singleton query_tokens dimension:
811
+ # [batch, 1, key_tokens]
812
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
813
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
814
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
815
+ if attention_mask is not None:
816
+ # assume that mask is expressed as:
817
+ # (1 = keep, 0 = discard)
818
+ # convert mask into a bias that can be added to attention scores:
819
+ # (keep = +0, discard = -10000.0)
820
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
821
+ attention_mask = attention_mask.unsqueeze(1)
822
+
823
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
824
+ if encoder_attention_mask is not None:
825
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
826
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
827
+
828
+ # 0. center input if necessary
829
+ if self.config.center_input_sample:
830
+ sample = 2 * sample - 1.0
831
+
832
+ # 1. time
833
+ timesteps = timestep
834
+ if not torch.is_tensor(timesteps):
835
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
836
+ # This would be a good case for the `match` statement (Python 3.10+)
837
+ is_mps = sample.device.type == "mps"
838
+ if isinstance(timestep, float):
839
+ dtype = torch.float32 if is_mps else torch.float64
840
+ else:
841
+ dtype = torch.int32 if is_mps else torch.int64
842
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
843
+ elif len(timesteps.shape) == 0:
844
+ timesteps = timesteps[None].to(sample.device)
845
+
846
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
847
+ timesteps = timesteps.expand(sample.shape[0])
848
+
849
+ t_emb = self.time_proj(timesteps)
850
+
851
+ # `Timesteps` does not contain any weights and will always return f32 tensors
852
+ # but time_embedding might actually be running in fp16. so we need to cast here.
853
+ # there might be better ways to encapsulate this.
854
+ t_emb = t_emb.to(dtype=sample.dtype)
855
+
856
+ emb = self.time_embedding(t_emb, timestep_cond)
857
+ aug_emb = None
858
+
859
+ if self.class_embedding is not None:
860
+ if class_labels is None:
861
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
862
+
863
+ if self.config.class_embed_type == "timestep":
864
+ class_labels = self.time_proj(class_labels)
865
+
866
+ # `Timesteps` does not contain any weights and will always return f32 tensors
867
+ # there might be better ways to encapsulate this.
868
+ class_labels = class_labels.to(dtype=sample.dtype)
869
+
870
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
871
+
872
+ if self.config.class_embeddings_concat:
873
+ emb = torch.cat([emb, class_emb], dim=-1)
874
+ else:
875
+ emb = emb + class_emb
876
+
877
+ if self.config.addition_embed_type == "text":
878
+ aug_emb = self.add_embedding(encoder_hidden_states)
879
+ elif self.config.addition_embed_type == "text_image":
880
+ # Kandinsky 2.1 - style
881
+ if "image_embeds" not in added_cond_kwargs:
882
+ raise ValueError(
883
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
884
+ )
885
+
886
+ image_embs = added_cond_kwargs.get("image_embeds")
887
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
888
+ aug_emb = self.add_embedding(text_embs, image_embs)
889
+ elif self.config.addition_embed_type == "text_time":
890
+ # SDXL - style
891
+ if "text_embeds" not in added_cond_kwargs:
892
+ raise ValueError(
893
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
894
+ )
895
+ text_embeds = added_cond_kwargs.get("text_embeds")
896
+ if "time_ids" not in added_cond_kwargs:
897
+ raise ValueError(
898
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
899
+ )
900
+ time_ids = added_cond_kwargs.get("time_ids")
901
+ time_embeds = self.add_time_proj(time_ids.flatten())
902
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
903
+
904
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
905
+ add_embeds = add_embeds.to(emb.dtype)
906
+ aug_emb = self.add_embedding(add_embeds)
907
+ elif self.config.addition_embed_type == "image":
908
+ # Kandinsky 2.2 - style
909
+ if "image_embeds" not in added_cond_kwargs:
910
+ raise ValueError(
911
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
912
+ )
913
+ image_embs = added_cond_kwargs.get("image_embeds")
914
+ aug_emb = self.add_embedding(image_embs)
915
+ elif self.config.addition_embed_type == "image_hint":
916
+ # Kandinsky 2.2 - style
917
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
918
+ raise ValueError(
919
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
920
+ )
921
+ image_embs = added_cond_kwargs.get("image_embeds")
922
+ hint = added_cond_kwargs.get("hint")
923
+ aug_emb, hint = self.add_embedding(image_embs, hint)
924
+ sample = torch.cat([sample, hint], dim=1)
925
+
926
+ emb = emb + aug_emb if aug_emb is not None else emb
927
+
928
+ if self.time_embed_act is not None:
929
+ emb = self.time_embed_act(emb)
930
+
931
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
932
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
933
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
934
+ # Kandinsky 2.1 - style
935
+ if "image_embeds" not in added_cond_kwargs:
936
+ raise ValueError(
937
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
938
+ )
939
+
940
+ image_embeds = added_cond_kwargs.get("image_embeds")
941
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
942
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
943
+ # Kandinsky 2.2 - style
944
+ if "image_embeds" not in added_cond_kwargs:
945
+ raise ValueError(
946
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
947
+ )
948
+ image_embeds = added_cond_kwargs.get("image_embeds")
949
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
950
+ # 2. pre-process
951
+ sample = self.conv_in(sample)
952
+
953
+ # 3. down
954
+
955
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
956
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
957
+
958
+ down_block_res_samples = (sample,)
959
+ for downsample_block in self.down_blocks:
960
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
961
+ # For t2i-adapter CrossAttnDownBlock2D
962
+ additional_residuals = {}
963
+ if is_adapter and len(down_block_additional_residuals) > 0:
964
+ additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
965
+
966
+ sample, res_samples = downsample_block(
967
+ hidden_states=sample,
968
+ temb=emb,
969
+ encoder_hidden_states=encoder_hidden_states,
970
+ attention_mask=attention_mask,
971
+ cross_attention_kwargs=cross_attention_kwargs,
972
+ encoder_attention_mask=encoder_attention_mask,
973
+ **additional_residuals,
974
+ )
975
+ else:
976
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
977
+
978
+ if is_adapter and len(down_block_additional_residuals) > 0:
979
+ sample += down_block_additional_residuals.pop(0)
980
+
981
+ down_block_res_samples += res_samples
982
+
983
+ if is_controlnet:
984
+ new_down_block_res_samples = ()
985
+
986
+ for down_block_res_sample, down_block_additional_residual in zip(
987
+ down_block_res_samples, down_block_additional_residuals
988
+ ):
989
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
990
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
991
+
992
+ down_block_res_samples = new_down_block_res_samples
993
+
994
+ # 4. mid
995
+ if self.mid_block is not None:
996
+ sample = self.mid_block(
997
+ sample,
998
+ emb,
999
+ encoder_hidden_states=encoder_hidden_states,
1000
+ attention_mask=attention_mask,
1001
+ cross_attention_kwargs=cross_attention_kwargs,
1002
+ encoder_attention_mask=encoder_attention_mask,
1003
+ )
1004
+
1005
+ if is_controlnet:
1006
+ sample = sample + mid_block_additional_residual
1007
+
1008
+ # 5. up
1009
+ for i, upsample_block in enumerate(self.up_blocks):
1010
+ is_final_block = i == len(self.up_blocks) - 1
1011
+
1012
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1013
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1014
+
1015
+ # if we have not reached the final block and need to forward the
1016
+ # upsample size, we do it here
1017
+ if not is_final_block and forward_upsample_size:
1018
+ upsample_size = down_block_res_samples[-1].shape[2:]
1019
+
1020
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1021
+ sample = upsample_block(
1022
+ hidden_states=sample,
1023
+ temb=emb,
1024
+ res_hidden_states_tuple=res_samples,
1025
+ encoder_hidden_states=encoder_hidden_states,
1026
+ cross_attention_kwargs=cross_attention_kwargs,
1027
+ upsample_size=upsample_size,
1028
+ attention_mask=attention_mask,
1029
+ encoder_attention_mask=encoder_attention_mask,
1030
+ )
1031
+ else:
1032
+ sample = upsample_block(
1033
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
1034
+ )
1035
+
1036
+ # 6. post-process
1037
+ if self.conv_norm_out:
1038
+ sample = self.conv_norm_out(sample)
1039
+ sample = self.conv_act(sample)
1040
+ sample = self.conv_out(sample)
1041
+
1042
+ if not return_dict:
1043
+ return (sample,)
1044
+
1045
+ return UNetMV2DConditionOutput(sample=sample)
1046
+
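A shape-level sketch of calling `forward`, with every dimension assumed for illustration (8 input channels and a 6-dimensional camera embedding, matching the `from_pretrained_2d` defaults below; the hidden size of 768 is also an assumption):

```python
# Illustrative shapes only; `unet` is assumed to be an instance of this model.
import torch

B, V = 1, 6                                           # one object, six views
sample = torch.randn(B * V, 8, 32, 32)                # noisy latents + conditioning channels
timestep = torch.tensor([10] * (B * V))
encoder_hidden_states = torch.randn(B * V, 1, 768)    # per-view image embedding (dim assumed)
camera = torch.randn(B * V, 6)                        # sin/cos camera embedding

out = unet(sample, timestep, encoder_hidden_states, class_labels=camera)
print(out.sample.shape)                               # (B * V, out_channels, 32, 32)
```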
1047
+ @classmethod
1048
+ def from_pretrained_2d(
1049
+ cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
1050
+ camera_embedding_type: str, num_views: int, sample_size: int,
1051
+ zero_init_conv_in: bool = True, zero_init_camera_projection: bool = False,
1052
+ projection_class_embeddings_input_dim: int=6, joint_attention: bool = False,
1053
+ joint_attention_twice: bool = False, multiview_attention: bool = True,
1054
+ cross_domain_attention: bool = False,
1055
+ in_channels: int = 8, out_channels: int = 4,
1056
+ **kwargs
1057
+ ):
1058
+ r"""
1059
+ Instantiate a pretrained PyTorch model from a pretrained model configuration.
1060
+
1061
+ The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
1062
+ train the model, set it back in training mode with `model.train()`.
1063
+
1064
+ Parameters:
1065
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
1066
+ Can be either:
1067
+
1068
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
1069
+ the Hub.
1070
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
1071
+ with [`~ModelMixin.save_pretrained`].
1072
+
1073
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
1074
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
1075
+ is not used.
1076
+ torch_dtype (`str` or `torch.dtype`, *optional*):
1077
+ Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
1078
+ dtype is automatically derived from the model's weights.
1079
+ force_download (`bool`, *optional*, defaults to `False`):
1080
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
1081
+ cached versions if they exist.
1082
+ resume_download (`bool`, *optional*, defaults to `False`):
1083
+ Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
1084
+ incompletely downloaded files are deleted.
1085
+ proxies (`Dict[str, str]`, *optional*):
1086
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
1087
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1088
+ output_loading_info (`bool`, *optional*, defaults to `False`):
1089
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
1090
+ local_files_only(`bool`, *optional*, defaults to `False`):
1091
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
1092
+ won't be downloaded from the Hub.
1093
+ use_auth_token (`str` or *bool*, *optional*):
1094
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
1095
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
1096
+ revision (`str`, *optional*, defaults to `"main"`):
1097
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
1098
+ allowed by Git.
1099
+ from_flax (`bool`, *optional*, defaults to `False`):
1100
+ Load the model weights from a Flax checkpoint save file.
1101
+ subfolder (`str`, *optional*, defaults to `""`):
1102
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
1103
+ mirror (`str`, *optional*):
1104
+ Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
1105
+ guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
1106
+ information.
1107
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
1108
+ A map that specifies where each submodule should go. It doesn't need to be defined for each
1109
+ parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
1110
+ same device.
1111
+
1112
+ Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
1113
+ more information about each option see [designing a device
1114
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
1115
+ max_memory (`Dict`, *optional*):
1116
+ A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
1117
+ each GPU and the available CPU RAM if unset.
1118
+ offload_folder (`str` or `os.PathLike`, *optional*):
1119
+ The path to offload weights if `device_map` contains the value `"disk"`.
1120
+ offload_state_dict (`bool`, *optional*):
1121
+ If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
1122
+ the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
1123
+ when there is some disk offload.
1124
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
1125
+ Speed up model loading by only loading the pretrained weights and not initializing the weights. This also
1126
+ tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
1127
+ Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
1128
+ argument to `True` will raise an error.
1129
+ variant (`str`, *optional*):
1130
+ Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
1131
+ loading `from_flax`.
1132
+ use_safetensors (`bool`, *optional*, defaults to `None`):
1133
+ If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
1134
+ `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
1135
+ weights. If set to `False`, `safetensors` weights are not loaded.
1136
+
1137
+ <Tip>
1138
+
1139
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
1140
+ `huggingface-cli login`. You can also activate the special
1141
+ ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
1142
+ firewalled environment.
1143
+
1144
+ </Tip>
1145
+
1146
+ Example:
1147
+
1148
+ ```py
1149
+ from diffusers import UNet2DConditionModel
1150
+
1151
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
1152
+ ```
1153
+
1154
+ If you get the error message below, you need to finetune the weights for your downstream task:
1155
+
1156
+ ```bash
1157
+ Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
1158
+ - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
1159
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1160
+ ```
1161
+ """
1162
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
1163
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
1164
+ force_download = kwargs.pop("force_download", False)
1165
+ from_flax = kwargs.pop("from_flax", False)
1166
+ resume_download = kwargs.pop("resume_download", False)
1167
+ proxies = kwargs.pop("proxies", None)
1168
+ output_loading_info = kwargs.pop("output_loading_info", False)
1169
+ local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
1170
+ use_auth_token = kwargs.pop("use_auth_token", None)
1171
+ revision = kwargs.pop("revision", None)
1172
+ torch_dtype = kwargs.pop("torch_dtype", None)
1173
+ subfolder = kwargs.pop("subfolder", None)
1174
+ device_map = kwargs.pop("device_map", None)
1175
+ max_memory = kwargs.pop("max_memory", None)
1176
+ offload_folder = kwargs.pop("offload_folder", None)
1177
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
1178
+ variant = kwargs.pop("variant", None)
1179
+ use_safetensors = kwargs.pop("use_safetensors", None)
1180
+
1181
+ if use_safetensors and not is_safetensors_available():
1182
+ raise ValueError(
1183
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
1184
+ )
1185
+
1186
+ allow_pickle = False
1187
+ if use_safetensors is None:
1188
+ use_safetensors = is_safetensors_available()
1189
+ allow_pickle = True
1190
+
1191
+ if device_map is not None and not is_accelerate_available():
1192
+ raise NotImplementedError(
1193
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
1194
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
1195
+ )
1196
+
1197
+ # Check if we can handle device_map and dispatching the weights
1198
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
1199
+ raise NotImplementedError(
1200
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
1201
+ " `device_map=None`."
1202
+ )
1203
+
1204
+ # Load config if we don't provide a configuration
1205
+ config_path = pretrained_model_name_or_path
1206
+
1207
+ user_agent = {
1208
+ "diffusers": __version__,
1209
+ "file_type": "model",
1210
+ "framework": "pytorch",
1211
+ }
1212
+
1213
+ # load config
1214
+ config, unused_kwargs, commit_hash = cls.load_config(
1215
+ config_path,
1216
+ cache_dir=cache_dir,
1217
+ return_unused_kwargs=True,
1218
+ return_commit_hash=True,
1219
+ force_download=force_download,
1220
+ resume_download=resume_download,
1221
+ proxies=proxies,
1222
+ local_files_only=local_files_only,
1223
+ use_auth_token=use_auth_token,
1224
+ revision=revision,
1225
+ subfolder=subfolder,
1226
+ device_map=device_map,
1227
+ max_memory=max_memory,
1228
+ offload_folder=offload_folder,
1229
+ offload_state_dict=offload_state_dict,
1230
+ user_agent=user_agent,
1231
+ **kwargs,
1232
+ )
1233
+
1234
+ # modify config
1235
+ config["_class_name"] = cls.__name__
1236
+ config['in_channels'] = in_channels
1237
+ config['out_channels'] = out_channels
1238
+ config['sample_size'] = sample_size # training resolution
1239
+ config['num_views'] = num_views
1240
+ config['joint_attention'] = joint_attention
1241
+ config['joint_attention_twice'] = joint_attention_twice
1242
+ config['multiview_attention'] = multiview_attention
1243
+ config['cross_domain_attention'] = cross_domain_attention
1244
+ config["down_block_types"] = [
1245
+ "CrossAttnDownBlockMV2D",
1246
+ "CrossAttnDownBlockMV2D",
1247
+ "CrossAttnDownBlockMV2D",
1248
+ "DownBlock2D"
1249
+ ]
1250
+ config['mid_block_type'] = "UNetMidBlockMV2DCrossAttn"
1251
+ config["up_block_types"] = [
1252
+ "UpBlock2D",
1253
+ "CrossAttnUpBlockMV2D",
1254
+ "CrossAttnUpBlockMV2D",
1255
+ "CrossAttnUpBlockMV2D"
1256
+ ]
1257
+ config['class_embed_type'] = 'projection'
1258
+ if camera_embedding_type == 'e_de_da_sincos':
1259
+ config['projection_class_embeddings_input_dim'] = projection_class_embeddings_input_dim # default 6
1260
+ else:
1261
+ raise NotImplementedError
1262
+
1263
+ # load model
1264
+ model_file = None
1265
+ if from_flax:
1266
+ raise NotImplementedError
1267
+ else:
1268
+ if use_safetensors:
1269
+ try:
1270
+ model_file = _get_model_file(
1271
+ pretrained_model_name_or_path,
1272
+ weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
1273
+ cache_dir=cache_dir,
1274
+ force_download=force_download,
1275
+ resume_download=resume_download,
1276
+ proxies=proxies,
1277
+ local_files_only=local_files_only,
1278
+ use_auth_token=use_auth_token,
1279
+ revision=revision,
1280
+ subfolder=subfolder,
1281
+ user_agent=user_agent,
1282
+ commit_hash=commit_hash,
1283
+ )
1284
+ except IOError as e:
1285
+ if not allow_pickle:
1286
+ raise e
1287
+ pass
1288
+ if model_file is None:
1289
+ model_file = _get_model_file(
1290
+ pretrained_model_name_or_path,
1291
+ weights_name=_add_variant(WEIGHTS_NAME, variant),
1292
+ cache_dir=cache_dir,
1293
+ force_download=force_download,
1294
+ resume_download=resume_download,
1295
+ proxies=proxies,
1296
+ local_files_only=local_files_only,
1297
+ use_auth_token=use_auth_token,
1298
+ revision=revision,
1299
+ subfolder=subfolder,
1300
+ user_agent=user_agent,
1301
+ commit_hash=commit_hash,
1302
+ )
1303
+
1304
+ model = cls.from_config(config, **unused_kwargs)
1305
+
1306
+ state_dict = load_state_dict(model_file, variant=variant)
1307
+ model._convert_deprecated_attention_blocks(state_dict)
1308
+
1309
+ conv_in_weight = state_dict['conv_in.weight']
1310
+ conv_out_weight = state_dict['conv_out.weight']
1311
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model_2d(
1312
+ model,
1313
+ state_dict,
1314
+ model_file,
1315
+ pretrained_model_name_or_path,
1316
+ ignore_mismatched_sizes=True,
1317
+ )
1318
+ if any([key == 'conv_in.weight' for key, _, _ in mismatched_keys]):
1319
+ # initialize from the original SD structure
1320
+ model.conv_in.weight.data[:,:4] = conv_in_weight
1321
+
1322
+ # optionally zero-initialize the weights of the newly added input channels
1323
+ if zero_init_conv_in:
1324
+ model.conv_in.weight.data[:,4:] = 0.
1325
+
1326
+ if any([key == 'conv_out.weight' for key, _, _ in mismatched_keys]):
1327
+ # initialize from the original SD structure
1328
+ model.conv_out.weight.data[:,:4] = conv_out_weight
1329
+ if out_channels == 8: # copy for the last 4 channels
1330
+ model.conv_out.weight.data[:, 4:] = conv_out_weight
1331
+
1332
+ if zero_init_camera_projection:
1333
+ for p in model.class_embedding.parameters():
1334
+ torch.nn.init.zeros_(p)
1335
+
1336
+ loading_info = {
1337
+ "missing_keys": missing_keys,
1338
+ "unexpected_keys": unexpected_keys,
1339
+ "mismatched_keys": mismatched_keys,
1340
+ "error_msgs": error_msgs,
1341
+ }
1342
+
1343
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
1344
+ raise ValueError(
1345
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
1346
+ )
1347
+ elif torch_dtype is not None:
1348
+ model = model.to(torch_dtype)
1349
+
1350
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
1351
+
1352
+ # Set model in evaluation mode to deactivate DropOut modules by default
1353
+ model.eval()
1354
+ if output_loading_info:
1355
+ return model, loading_info
1356
+
1357
+ return model
1358
+
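A usage sketch for `from_pretrained_2d`; the checkpoint name is a placeholder, the class name is assumed to be `UNetMV2DConditionModel`, and the keyword arguments are drawn from the signature above:

```python
# Illustrative only: checkpoint name and argument values are assumptions.
unet = UNetMV2DConditionModel.from_pretrained_2d(
    "lambdalabs/sd-image-variations-diffusers",   # placeholder base checkpoint
    subfolder="unet",
    camera_embedding_type="e_de_da_sincos",
    num_views=6,
    sample_size=32,
    zero_init_conv_in=True,
    multiview_attention=True,
)
```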
1359
+ @classmethod
1360
+ def _load_pretrained_model_2d(
1361
+ cls,
1362
+ model,
1363
+ state_dict,
1364
+ resolved_archive_file,
1365
+ pretrained_model_name_or_path,
1366
+ ignore_mismatched_sizes=False,
1367
+ ):
1368
+ # Retrieve missing & unexpected_keys
1369
+ model_state_dict = model.state_dict()
1370
+ loaded_keys = list(state_dict.keys())
1371
+
1372
+ expected_keys = list(model_state_dict.keys())
1373
+
1374
+ original_loaded_keys = loaded_keys
1375
+
1376
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
1377
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
1378
+
1379
+ # Make sure we are able to load base models as well as derived models (with heads)
1380
+ model_to_load = model
1381
+
1382
+ def _find_mismatched_keys(
1383
+ state_dict,
1384
+ model_state_dict,
1385
+ loaded_keys,
1386
+ ignore_mismatched_sizes,
1387
+ ):
1388
+ mismatched_keys = []
1389
+ if ignore_mismatched_sizes:
1390
+ for checkpoint_key in loaded_keys:
1391
+ model_key = checkpoint_key
1392
+
1393
+ if (
1394
+ model_key in model_state_dict
1395
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
1396
+ ):
1397
+ mismatched_keys.append(
1398
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
1399
+ )
1400
+ del state_dict[checkpoint_key]
1401
+ return mismatched_keys
1402
+
1403
+ if state_dict is not None:
1404
+ # Whole checkpoint
1405
+ mismatched_keys = _find_mismatched_keys(
1406
+ state_dict,
1407
+ model_state_dict,
1408
+ original_loaded_keys,
1409
+ ignore_mismatched_sizes,
1410
+ )
1411
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
1412
+
1413
+ if len(error_msgs) > 0:
1414
+ error_msg = "\n\t".join(error_msgs)
1415
+ if "size mismatch" in error_msg:
1416
+ error_msg += (
1417
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
1418
+ )
1419
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
1420
+
1421
+ if len(unexpected_keys) > 0:
1422
+ logger.warning(
1423
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
1424
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
1425
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
1426
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
1427
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
1428
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
1429
+ " identical (initializing a BertForSequenceClassification model from a"
1430
+ " BertForSequenceClassification model)."
1431
+ )
1432
+ else:
1433
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
1434
+ if len(missing_keys) > 0:
1435
+ logger.warning(
1436
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1437
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
1438
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
1439
+ )
1440
+ elif len(mismatched_keys) == 0:
1441
+ logger.info(
1442
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
1443
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
1444
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
1445
+ " without further training."
1446
+ )
1447
+ if len(mismatched_keys) > 0:
1448
+ mismatched_warning = "\n".join(
1449
+ [
1450
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
1451
+ for key, shape1, shape2 in mismatched_keys
1452
+ ]
1453
+ )
1454
+ logger.warning(
1455
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1456
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
1457
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
1458
+ " able to use it for predictions and inference."
1459
+ )
1460
+
1461
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
1462
+
mvdiffusion/pipelines/pipeline_mvdiffusion_image.py ADDED
@@ -0,0 +1,485 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import warnings
17
+ from typing import Callable, List, Optional, Union
18
+
19
+ import PIL
20
+ import torch
21
+ import torchvision.transforms.functional as TF
22
+ from packaging import version
23
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
24
+
25
+ from diffusers.configuration_utils import FrozenDict
26
+ from diffusers.image_processor import VaeImageProcessor
27
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
28
+ from diffusers.schedulers import KarrasDiffusionSchedulers
29
+ from diffusers.utils import deprecate, logging, randn_tensor
30
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
31
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
32
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
33
+
34
+
35
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
36
+
37
+
38
+ class MVDiffusionImagePipeline(DiffusionPipeline):
39
+ r"""
40
+ Pipeline to generate image variations from an input image using Stable Diffusion.
41
+
42
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
43
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
44
+
45
+ Args:
46
+ vae ([`AutoencoderKL`]):
47
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
48
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
49
+ Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
50
+ text_encoder ([`~transformers.CLIPTextModel`]):
51
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
52
+ tokenizer ([`~transformers.CLIPTokenizer`]):
53
+ A `CLIPTokenizer` to tokenize text.
54
+ unet ([`UNet2DConditionModel`]):
55
+ A `UNet2DConditionModel` to denoise the encoded image latents.
56
+ scheduler ([`SchedulerMixin`]):
57
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
58
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
59
+ safety_checker ([`StableDiffusionSafetyChecker`]):
60
+ Classification module that estimates whether generated images could be considered offensive or harmful.
61
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
62
+ about a model's potential harms.
63
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
64
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
65
+ """
66
+ # TODO: feature_extractor is required to encode images (if they are in PIL format),
67
+ # we should give a descriptive message if the pipeline doesn't have one.
68
+ _optional_components = ["safety_checker"]
69
+
70
+ def __init__(
71
+ self,
72
+ vae: AutoencoderKL,
73
+ image_encoder: CLIPVisionModelWithProjection,
74
+ unet: UNet2DConditionModel,
75
+ scheduler: KarrasDiffusionSchedulers,
76
+ safety_checker: StableDiffusionSafetyChecker,
77
+ feature_extractor: CLIPImageProcessor,
78
+ requires_safety_checker: bool = True,
79
+ camera_embedding_type: str = 'e_de_da_sincos',
80
+ num_views: int = 4
81
+ ):
82
+ super().__init__()
83
+
84
+ if safety_checker is None and requires_safety_checker:
85
+ logger.warning(
86
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
87
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
88
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
89
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
90
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
91
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
92
+ )
93
+
94
+ if safety_checker is not None and feature_extractor is None:
95
+ raise ValueError(
96
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
97
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
98
+ )
99
+
100
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
101
+ version.parse(unet.config._diffusers_version).base_version
102
+ ) < version.parse("0.9.0.dev0")
103
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
104
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
105
+ deprecation_message = (
106
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
107
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
108
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
109
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
110
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
111
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
112
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
113
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
114
+ " the `unet/config.json` file"
115
+ )
116
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
117
+ new_config = dict(unet.config)
118
+ new_config["sample_size"] = 64
119
+ unet._internal_dict = FrozenDict(new_config)
120
+
121
+ self.register_modules(
122
+ vae=vae,
123
+ image_encoder=image_encoder,
124
+ unet=unet,
125
+ scheduler=scheduler,
126
+ safety_checker=safety_checker,
127
+ feature_extractor=feature_extractor,
128
+ )
129
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
130
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
131
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
132
+
133
+ self.camera_embedding_type: str = camera_embedding_type
134
+ self.num_views: int = num_views
135
+
136
+ def _encode_image(self, image_pil, device, num_images_per_prompt, do_classifier_free_guidance):
137
+ dtype = next(self.image_encoder.parameters()).dtype
138
+
139
+ image_pt = self.feature_extractor(images=image_pil, return_tensors="pt").pixel_values
140
+ image_pt = image_pt.to(device=device, dtype=dtype)
141
+ image_embeddings = self.image_encoder(image_pt).image_embeds
142
+ image_embeddings = image_embeddings.unsqueeze(1)
143
+
144
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
145
+ # Note: repeat differently from official pipelines
146
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
147
+ bs_embed, seq_len, _ = image_embeddings.shape
148
+ image_embeddings = image_embeddings.repeat(num_images_per_prompt, 1, 1)
149
+
150
+ if do_classifier_free_guidance:
151
+ negative_prompt_embeds = torch.zeros_like(image_embeddings)
152
+
153
+ # For classifier free guidance, we need to do two forward passes.
154
+ # Here we concatenate the unconditional and text embeddings into a single batch
155
+ # to avoid doing two forward passes
156
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
157
+
158
+ image_pt = torch.stack([TF.to_tensor(img) for img in image_pil], dim=0).to(device)
159
+ image_pt = image_pt * 2.0 - 1.0
160
+ image_latents = self.vae.encode(image_pt).latent_dist.mode() * self.vae.config.scaling_factor
161
+ # Note: repeat differently from official pipelines
162
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
163
+ image_latents = image_latents.repeat(num_images_per_prompt, 1, 1, 1)
164
+
165
+ if do_classifier_free_guidance:
166
+ image_latents = torch.cat([torch.zeros_like(image_latents), image_latents])
167
+
168
+ return image_embeddings, image_latents
169
+
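The "repeat differently from official pipelines" comments above refer to whole-batch tiling rather than per-sample interleaving; a small sketch of the difference:

```python
# Illustrative only: whole-batch tiling vs. per-sample interleaving.
import torch

batch = torch.tensor([1, 2, 3, 4])        # B1 B2 B3 B4
print(batch.repeat(2))                    # tensor([1, 2, 3, 4, 1, 2, 3, 4])  <- this pipeline
print(batch.repeat_interleave(2))         # tensor([1, 1, 2, 2, 3, 3, 4, 4])  <- per-sample repeat
```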
170
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
171
+ def run_safety_checker(self, image, device, dtype):
172
+ if self.safety_checker is None:
173
+ has_nsfw_concept = None
174
+ else:
175
+ if torch.is_tensor(image):
176
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
177
+ else:
178
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
179
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
180
+ image, has_nsfw_concept = self.safety_checker(
181
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
182
+ )
183
+ return image, has_nsfw_concept
184
+
185
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
186
+ def decode_latents(self, latents):
187
+ warnings.warn(
188
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
189
+ " use VaeImageProcessor instead",
190
+ FutureWarning,
191
+ )
192
+ latents = 1 / self.vae.config.scaling_factor * latents
193
+ image = self.vae.decode(latents, return_dict=False)[0]
194
+ image = (image / 2 + 0.5).clamp(0, 1)
195
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
196
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
197
+ return image
198
+
199
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
200
+ def prepare_extra_step_kwargs(self, generator, eta):
201
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
202
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
203
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
204
+ # and should be between [0, 1]
205
+
206
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
207
+ extra_step_kwargs = {}
208
+ if accepts_eta:
209
+ extra_step_kwargs["eta"] = eta
210
+
211
+ # check if the scheduler accepts generator
212
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
213
+ if accepts_generator:
214
+ extra_step_kwargs["generator"] = generator
215
+ return extra_step_kwargs
216
+
217
+ def check_inputs(self, image, height, width, callback_steps):
218
+ if (
219
+ not isinstance(image, torch.Tensor)
220
+ and not isinstance(image, PIL.Image.Image)
221
+ and not isinstance(image, list)
222
+ ):
223
+ raise ValueError(
224
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
225
+ f" {type(image)}"
226
+ )
227
+
228
+ if height % 8 != 0 or width % 8 != 0:
229
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
230
+
231
+ if (callback_steps is None) or (
232
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
233
+ ):
234
+ raise ValueError(
235
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
236
+ f" {type(callback_steps)}."
237
+ )
238
+
239
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
240
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
241
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
242
+ if isinstance(generator, list) and len(generator) != batch_size:
243
+ raise ValueError(
244
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
245
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
246
+ )
247
+
248
+ if latents is None:
249
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
250
+ else:
251
+ latents = latents.to(device)
252
+
253
+ # scale the initial noise by the standard deviation required by the scheduler
254
+ latents = latents * self.scheduler.init_noise_sigma
255
+ return latents
256
+
257
+ def prepare_camera_embedding(self, camera_embedding: Union[float, torch.Tensor], do_classifier_free_guidance, num_images_per_prompt=1):
258
+ # (B, 3)
259
+ camera_embedding = camera_embedding.to(dtype=self.unet.dtype, device=self.unet.device)
260
+
261
+ if self.camera_embedding_type == 'e_de_da_sincos':
262
+ # (B, 6)
263
+ camera_embedding = torch.cat([
264
+ torch.sin(camera_embedding),
265
+ torch.cos(camera_embedding)
266
+ ], dim=-1)
267
+ assert self.unet.config.class_embed_type == 'projection'
268
+ assert self.unet.config.projection_class_embeddings_input_dim == 6 or self.unet.config.projection_class_embeddings_input_dim == 10
269
+ else:
270
+ raise NotImplementedError
271
+
272
+ # Note: repeat differently from official pipelines
273
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
274
+ camera_embedding = camera_embedding.repeat(num_images_per_prompt, 1)
275
+
276
+ if do_classifier_free_guidance:
277
+ camera_embedding = torch.cat([
278
+ camera_embedding,
279
+ camera_embedding
280
+ ], dim=0)
281
+
282
+ return camera_embedding
283
+
284
+ @torch.no_grad()
285
+ def __call__(
286
+ self,
287
+ image: Union[List[PIL.Image.Image], torch.FloatTensor],
288
+ # elevation_cond: torch.FloatTensor,
289
+ # elevation: torch.FloatTensor,
290
+ # azimuth: torch.FloatTensor,
291
+ camera_embedding: torch.FloatTensor,
292
+ height: Optional[int] = None,
293
+ width: Optional[int] = None,
294
+ num_inference_steps: int = 50,
295
+ guidance_scale: float = 7.5,
296
+ num_images_per_prompt: Optional[int] = 1,
297
+ eta: float = 0.0,
298
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
299
+ latents: Optional[torch.FloatTensor] = None,
300
+ output_type: Optional[str] = "pil",
301
+ return_dict: bool = True,
302
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
303
+ callback_steps: int = 1,
304
+ normal_cond: Optional[Union[List[PIL.Image.Image], torch.FloatTensor]] = None,
305
+ ):
306
+ r"""
307
+ The call function to the pipeline for generation.
308
+
309
+ Args:
310
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
311
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
312
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
313
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
314
+ The height in pixels of the generated image.
315
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
316
+ The width in pixels of the generated image.
317
+ num_inference_steps (`int`, *optional*, defaults to 50):
318
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
319
+ expense of slower inference.
320
+ guidance_scale (`float`, *optional*, defaults to 7.5):
321
+ A higher guidance scale value encourages the model to generate images that closely match the conditioning
322
+ image, at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
323
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
324
+ The number of images to generate per prompt.
325
+ eta (`float`, *optional*, defaults to 0.0):
326
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
327
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
328
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
329
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
330
+ generation deterministic.
331
+ latents (`torch.FloatTensor`, *optional*):
332
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
333
+ generation. Can be used to tweak the same generation with different inputs. If not provided, a latents
334
+ tensor is generated by sampling using the supplied random `generator`.
335
+ output_type (`str`, *optional*, defaults to `"pil"`):
336
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
337
+ return_dict (`bool`, *optional*, defaults to `True`):
338
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
339
+ plain tuple.
340
+ callback (`Callable`, *optional*):
341
+ A function that is called every `callback_steps` steps during inference. The function is called with the
342
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
343
+ callback_steps (`int`, *optional*, defaults to 1):
344
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
345
+ every step.
346
+
347
+ Returns:
348
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
349
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
350
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
351
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
352
+ "not-safe-for-work" (nsfw) content.
353
+
354
+ Examples:
355
+
356
+ ```py
357
+ from diffusers import StableDiffusionImageVariationPipeline
358
+ from PIL import Image
359
+ from io import BytesIO
360
+ import requests
361
+
362
+ pipe = StableDiffusionImageVariationPipeline.from_pretrained(
363
+ "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
364
+ )
365
+ pipe = pipe.to("cuda")
366
+
367
+ url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
368
+
369
+ response = requests.get(url)
370
+ image = Image.open(BytesIO(response.content)).convert("RGB")
371
+
372
+ out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
373
+ out["images"][0].save("result.jpg")
374
+ ```
375
+ """
376
+ # 0. Default height and width to unet
377
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
378
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
379
+
380
+ # 1. Check inputs. Raise error if not correct
381
+ self.check_inputs(image, height, width, callback_steps)
382
+
383
+
384
+ # 2. Define call parameters
385
+ if isinstance(image, list):
386
+ batch_size = len(image)
387
+ else:
388
+ batch_size = image.shape[0]
389
+ assert batch_size >= self.num_views and batch_size % self.num_views == 0
390
+ device = self._execution_device
391
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
392
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
393
+ # corresponds to doing no classifier free guidance.
394
+ do_classifier_free_guidance = guidance_scale != 1.0
395
+
396
+ # 3. Encode input image
397
+ if isinstance(image, list):
398
+ image_pil = image
399
+ elif isinstance(image, torch.Tensor):
400
+ image_pil = [TF.to_pil_image(image[i]) for i in range(image.shape[0])]
401
+ image_embeddings, image_latents = self._encode_image(image_pil, device, num_images_per_prompt, do_classifier_free_guidance)
402
+
403
+ if normal_cond is not None:
404
+ if isinstance(normal_cond, list):
405
+ normal_cond_pil = normal_cond
406
+ elif isinstance(normal_cond, torch.Tensor):
407
+ normal_cond_pil = [TF.to_pil_image(normal_cond[i]) for i in range(normal_cond.shape[0])]
408
+ _, image_latents = self._encode_image(normal_cond_pil, device, num_images_per_prompt, do_classifier_free_guidance)
409
+
410
+
411
+ # assert len(elevation_cond) == batch_size and len(elevation) == batch_size and len(azimuth) == batch_size
412
+ # camera_embeddings = self.prepare_camera_condition(elevation_cond, elevation, azimuth, do_classifier_free_guidance=do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt)
413
+ assert len(camera_embedding) == batch_size
414
+ camera_embeddings = self.prepare_camera_embedding(camera_embedding, do_classifier_free_guidance=do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt)
415
+
416
+ # 4. Prepare timesteps
417
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
418
+ timesteps = self.scheduler.timesteps
419
+
420
+ # 5. Prepare latent variables
421
+ num_channels_latents = self.unet.config.out_channels
422
+ latents = self.prepare_latents(
423
+ batch_size * num_images_per_prompt,
424
+ num_channels_latents,
425
+ height,
426
+ width,
427
+ image_embeddings.dtype,
428
+ device,
429
+ generator,
430
+ latents,
431
+ )
432
+
433
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
434
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
435
+
436
+ # 7. Denoising loop
437
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
438
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
439
+ for i, t in enumerate(timesteps):
440
+ # expand the latents if we are doing classifier free guidance
441
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
442
+ latent_model_input = torch.cat([
443
+ latent_model_input, image_latents
444
+ ], dim=1)
445
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
446
+
447
+ # predict the noise residual
448
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings, class_labels=camera_embeddings).sample
449
+
450
+ # perform guidance
451
+ if do_classifier_free_guidance:
452
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
453
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
454
+
455
+ # compute the previous noisy sample x_t -> x_t-1
456
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
457
+
458
+ # call the callback, if provided
459
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
460
+ progress_bar.update()
461
+ if callback is not None and i % callback_steps == 0:
462
+ callback(i, t, latents)
463
+
464
+ if not output_type == "latent":
465
+ if num_channels_latents == 8:
466
+ latents = torch.cat([latents[:, :4], latents[:, 4:]], dim=0)
467
+
468
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
469
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
470
+ else:
471
+ image = latents
472
+ has_nsfw_concept = None
473
+
474
+ if has_nsfw_concept is None:
475
+ do_denormalize = [True] * image.shape[0]
476
+ else:
477
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
478
+
479
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
480
+
481
+ if not return_dict:
482
+ return (image, has_nsfw_concept)
483
+
484
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
485
+
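The docstring example above is inherited from Diffusers' image-variation pipeline and omits the `camera_embedding` argument that this `__call__` requires. Below is a minimal usage sketch under stated assumptions: the pipeline class defined earlier in this file is assumed to be `MVDiffusionImagePipeline`, the checkpoint path is a placeholder, `pipe.num_views` is assumed to be 6 (matching the six-view config in this commit), and the `(B, 3)` camera embedding is assumed to hold (conditioning elevation, relative elevation, relative azimuth), as the `e_de_da_sincos` branch of `prepare_camera_embedding` suggests.

```python
import numpy as np
import torch
from PIL import Image

# Assumed class name / import path; adjust to the actual definition earlier in this file.
from mvdiffusion.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline

pipe = MVDiffusionImagePipeline.from_pretrained("path/to/mvdiffusion-checkpoint")  # placeholder path
pipe = pipe.to("cuda")

num_views = 6
cond = Image.open("example_images/owl.png").convert("RGB").resize((256, 256))

# The batch size must be a multiple of pipe.num_views; repeat the conditioning image once per view.
images = [cond] * num_views

# Assumed (B, 3) layout: (cond-view elevation, relative elevation, relative azimuth), in radians.
azimuths = np.deg2rad([0, 45, 90, 180, 270, 315])
camera_embedding = torch.stack([
    torch.zeros(num_views),                       # elevation of the conditioning view
    torch.zeros(num_views),                       # relative elevation of each target view
    torch.tensor(azimuths, dtype=torch.float32),  # relative azimuth of each target view
], dim=-1)

out = pipe(images, camera_embedding=camera_embedding,
           num_inference_steps=50, guidance_scale=3.0, output_type="pil")
out.images[0].save("view_000.png")
```

Note that when the UNet predicts 8 latent channels (a joint model), the pipeline splits the latents into two 4-channel halves and concatenates them along the batch dimension before decoding, so `out.images` then contains both decoded domains back to back.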
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu113
2
+ torch==1.12.1
3
+ torchvision==0.13.1
4
+ diffusers[torch]==0.11.1
5
+ transformers>=4.25.1
6
+ bitsandbytes==0.35.4
7
+ decord==0.6.0
8
+ pytorch-lightning<2
9
+ omegaconf==2.2.3
10
+ nerfacc==0.3.3
11
+ trimesh==3.9.8
12
+ pyhocon==0.3.57
13
+ icecream==2.1.0
14
+ PyMCubes==0.1.2
15
+ xformers
16
+ accelerate
17
+ modelcards
18
+ einops
19
+ ftfy
20
+ piq
21
+ matplotlib
22
+ opencv-python
23
+ imageio
24
+ imageio-ffmpeg
25
+ scipy
26
+ pyransac3d
27
+ torch_efficient_distloss
28
+ tensorboard
29
+ rembg
30
+ segment_anything
run_test.sh ADDED
@@ -0,0 +1 @@
1
+ accelerate launch --config_file 1gpu.yaml test_mvdiffusion_seq.py --config configs/mvdiffusion-joint-ortho-6views.yaml
utils/misc.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ from omegaconf import OmegaConf
3
+ from packaging import version
4
+
5
+
6
+ # ============ Register OmegaConf Resolvers ============= #
7
+ OmegaConf.register_new_resolver('calc_exp_lr_decay_rate', lambda factor, n: factor**(1./n))
8
+ OmegaConf.register_new_resolver('add', lambda a, b: a + b)
9
+ OmegaConf.register_new_resolver('sub', lambda a, b: a - b)
10
+ OmegaConf.register_new_resolver('mul', lambda a, b: a * b)
11
+ OmegaConf.register_new_resolver('div', lambda a, b: a / b)
12
+ OmegaConf.register_new_resolver('idiv', lambda a, b: a // b)
13
+ OmegaConf.register_new_resolver('basename', lambda p: os.path.basename(p))
14
+ # ======================================================= #
15
+
16
+
17
+ def prompt(question):
18
+ inp = input(f"{question} (y/n)").lower().strip()
19
+ if inp and inp == 'y':
20
+ return True
21
+ if inp and inp == 'n':
22
+ return False
23
+ return prompt(question)
24
+
25
+
26
+ def load_config(*yaml_files, cli_args=[]):
27
+ yaml_confs = [OmegaConf.load(f) for f in yaml_files]
28
+ cli_conf = OmegaConf.from_cli(cli_args)
29
+ conf = OmegaConf.merge(*yaml_confs, cli_conf)
30
+ OmegaConf.resolve(conf)
31
+ return conf
32
+
33
+
34
+ def config_to_primitive(config, resolve=True):
35
+ return OmegaConf.to_container(config, resolve=resolve)
36
+
37
+
38
+ def dump_config(path, config):
39
+ with open(path, 'w') as fp:
40
+ OmegaConf.save(config=config, f=fp)
41
+
42
+ def get_rank():
43
+ # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing,
44
+ # therefore LOCAL_RANK needs to be checked first
45
+ rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID", "JSM_NAMESPACE_RANK")
46
+ for key in rank_keys:
47
+ rank = os.environ.get(key)
48
+ if rank is not None:
49
+ return int(rank)
50
+ return 0
51
+
52
+
53
+ def parse_version(ver):
54
+ return version.parse(ver)
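A short, self-contained sketch of how these helpers compose: `load_config` merges one or more YAML files with CLI-style overrides and resolves the custom OmegaConf resolvers registered above. The YAML content and the override below are illustrative only (not one of the repo's real configs), and the script is assumed to run from the repository root so that `utils.misc` is importable.

```python
import os
import tempfile

from utils.misc import load_config, config_to_primitive, get_rank

# Write a small illustrative config (placeholder values, not a real repo config).
cfg_text = """
num_views: 6
batch_size: ${mul:${num_views},2}  # num_views * 2 via the 'mul' resolver
exp_name: ${basename:'configs/mvdiffusion-joint-ortho-6views.yaml'}
"""
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    f.write(cfg_text)
    cfg_path = f.name

# CLI-style dotlist overrides are merged on top of the YAML files before resolution.
conf = load_config(cfg_path, cli_args=["num_views=4"])
print(config_to_primitive(conf))  # {'num_views': 4, 'batch_size': 8, 'exp_name': 'mvdiffusion-joint-ortho-6views.yaml'}
print("rank:", get_rank())        # 0 unless RANK/LOCAL_RANK/SLURM_PROCID is set

os.remove(cfg_path)
```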