Spaces:

Adarsh7700
/

duplicate-repo

Running

App Files Files Community

Adarsh Patel committed on Nov 8, 2024

Commit

4baad62

1 Parent(s): 93adcf7

files added

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
.gitignore +1 -0
LICENSE +201 -0
README.md +7 -5
app.py +391 -0
configs/instant-mesh-base.yaml +22 -0
configs/instant-mesh-large.yaml +22 -0
configs/instant-nerf-base.yaml +21 -0
configs/instant-nerf-large.yaml +21 -0
examples/bird.jpg +0 -0
examples/bubble_mart_blue.png +0 -0
examples/cake.jpg +0 -0
examples/cartoon_dinosaur.png +0 -0
examples/cartoon_panda.png +3 -0
examples/chair_armed.png +0 -0
examples/chair_comfort.jpg +0 -0
examples/chair_wood.jpg +0 -0
examples/chest.jpg +0 -0
examples/cute_horse.jpg +0 -0
examples/cute_tiger.jpg +0 -0
examples/earphone.jpg +0 -0
examples/fox.jpg +0 -0
examples/fruit.jpg +0 -0
examples/fruit_elephant.jpg +0 -0
examples/genshin_building.png +0 -0
examples/genshin_teapot.png +0 -0
examples/hatsune_miku.png +0 -0
examples/house2.jpg +0 -0
examples/mushroom_teapot.jpg +0 -0
examples/pikachu.png +0 -0
examples/plant.jpg +0 -0
examples/robot.jpg +0 -0
examples/sea_turtle.png +0 -0
examples/skating_shoe.jpg +0 -0
examples/sorting_board.png +0 -0
examples/sword.png +0 -0
examples/toy_car.jpg +0 -0
examples/watermelon.png +0 -0
examples/whitedog.png +0 -0
examples/x_teapot.jpg +0 -0
examples/x_toyduck.jpg +0 -0
requirements.txt +23 -0
src/__init__.py +0 -0
src/data/__init__.py +0 -0
src/data/objaverse.py +329 -0
src/model.py +310 -0
src/model_mesh.py +325 -0
src/models/__init__.py +0 -0
src/models/decoder/__init__.py +0 -0
src/models/decoder/transformer.py +123 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/cartoon_panda.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,12 +1,14 @@
 ---
-title: Duplicate Repo
-emoji: 🐨
-colorFrom: blue
-colorTo: gray
 sdk: gradio
-sdk_version: 5.5.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: InstantMesh
+emoji: 📚
+colorFrom: indigo
+colorTo: green
 sdk: gradio
+sdk_version: 4.26.0
 app_file: app.py
 pinned: false
+short_description: Create a 3D model from an image in 10 seconds!
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,391 @@

+import spaces
+import os
+import imageio
+import numpy as np
+import torch
+import rembg
+from PIL import Image
+from torchvision.transforms import v2
+from pytorch_lightning import seed_everything
+from omegaconf import OmegaConf
+from einops import rearrange, repeat
+from tqdm import tqdm
+from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
+from src.utils.train_util import instantiate_from_config
+from src.utils.camera_util import (
+    FOV_to_intrinsics,
+    get_zero123plus_input_cameras,
+    get_circular_camera_poses,
+)
+from src.utils.mesh_util import save_obj, save_glb
+from src.utils.infer_util import remove_background, resize_foreground, images_to_video
+import tempfile
+from functools import partial
+from huggingface_hub import hf_hub_download
+import gradio as gr
+def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
+    """
+    Build a batch of camera parameters on a circular orbit for rendering.
+    Args:
+        batch_size: Number of times the camera set is repeated along a new leading dim.
+        M: Number of poses requested from ``get_circular_camera_poses``.
+        radius: Forwarded to ``get_circular_camera_poses``.
+        elevation: Forwarded to ``get_circular_camera_poses``.
+        is_flexicubes: When true, return the inverted pose matrices; otherwise return
+            the flattened poses concatenated with flattened intrinsics.
+    Returns:
+        A tensor whose leading dimension is ``batch_size``.
+    """
+    poses = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
+    if is_flexicubes:
+        # Presumably the poses are camera-to-world matrices (the upstream name is
+        # "c2ws"), so the inverse would be world-to-camera -- TODO confirm.
+        inverted = torch.linalg.inv(poses)
+        return inverted.unsqueeze(0).repeat(batch_size, 1, 1, 1)
+    flat_extrinsics = poses.flatten(-2)
+    single_intrinsics = FOV_to_intrinsics(50.0)
+    flat_intrinsics = single_intrinsics.unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
+    packed = torch.cat([flat_extrinsics, flat_intrinsics], dim=-1)
+    return packed.unsqueeze(0).repeat(batch_size, 1, 1)
+def images_to_video(images, output_path, fps=30):
+    """
+    Encode a batch of frames as an H.264 video file.
+    Args:
+        images: Tensor indexed as (N, C, H, W). Values are multiplied by 255, so they
+            are expected to lie in [0, 1]; anything outside is clamped.
+        output_path: Destination file path. Its parent directory is created if needed.
+        fps: Frames per second passed to the encoder.
+    """
+    # NOTE: this definition shadows the `images_to_video` imported from
+    # src.utils.infer_util at the top of the file.
+    out_dir = os.path.dirname(output_path)
+    if out_dir:
+        # os.makedirs('') raises, so only create a directory when the path names one.
+        os.makedirs(out_dir, exist_ok=True)
+    # Clamp while still in floating point. Casting to uint8 first can wrap
+    # out-of-range values instead of saturating them, and a clip applied after the
+    # cast can no longer repair that.
+    scaled = images.permute(0, 2, 3, 1).cpu().numpy() * 255    # (N, H, W, C)
+    frames = scaled.clip(0, 255).astype(np.uint8)
+    imageio.mimwrite(output_path, frames, fps=fps, codec='h264')
+###############################################################################
+# Configuration.
+###############################################################################
+import shutil
+def find_cuda():
+    """
+    Locate the CUDA installation directory.
+    The CUDA_HOME and CUDA_PATH environment variables are consulted first (in that
+    order); failing that, the location of `nvcc` on PATH is used.
+    Returns:
+        The installation path as a string, or None when nothing was found.
+    """
+    env_root = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
+    if env_root and os.path.exists(env_root):
+        return env_root
+    nvcc = shutil.which('nvcc')
+    if not nvcc:
+        return None
+    # Drop the trailing 'bin/nvcc' by going up two directory levels.
+    bin_dir = os.path.dirname(nvcc)
+    return os.path.dirname(bin_dir)
+# Report whether a CUDA toolkit could be located (informational only).
+cuda_path = find_cuda()
+if cuda_path:
+    print(f"CUDA installation found at: {cuda_path}")
+else:
+    print("CUDA installation not found")
+# Reconstruction-model configuration; the config file name selects the model family.
+config_path = 'configs/instant-mesh-large.yaml'
+config = OmegaConf.load(config_path)
+config_name = os.path.basename(config_path).replace('.yaml', '')
+model_config = config.model_config
+infer_config = config.infer_config
+# startswith() already returns a bool, so no conditional expression is needed.
+IS_FLEXICUBES = config_name.startswith('instant-mesh')
+device = torch.device('cuda')
+# load diffusion model
+print('Loading diffusion model ...')
+pipeline = DiffusionPipeline.from_pretrained(
+    "sudo-ai/zero123plus-v1.2",
+    custom_pipeline="zero123plus",
+    torch_dtype=torch.float16,
+)
+# Swap in an Euler-ancestral scheduler built from the pipeline's existing
+# scheduler config, overriding only the timestep spacing.
+pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
+    pipeline.scheduler.config, timestep_spacing='trailing'
+)
+# load custom white-background UNet
+unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
+# NOTE(review): torch.load unpickles the downloaded file; this is only safe as long
+# as the TencentARC/InstantMesh repository is trusted.
+state_dict = torch.load(unet_ckpt_path, map_location='cpu')
+pipeline.unet.load_state_dict(state_dict, strict=True)
+pipeline = pipeline.to(device)
+# load reconstruction model
+print('Loading reconstruction model ...')
+model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_mesh_large.ckpt", repo_type="model")
+model = instantiate_from_config(model_config)
+# NOTE(review): same unpickling caveat as for the UNet checkpoint above.
+state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
+# Keep only the 'lrm_generator.' weights, dropping that 14-character prefix from
+# each key, and skip any entry whose name contains 'source_camera'.
+state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
+model.load_state_dict(state_dict, strict=True)
+model = model.to(device)
+print('Loading Finished!')
+def check_input_image(input_image):
+    """Raise a Gradio error when no input image was supplied; otherwise do nothing."""
+    image_missing = input_image is None
+    if image_missing:
+        raise gr.Error("No image uploaded!")
+def preprocess(input_image, do_remove_background):
+    """
+    Optionally remove the background of the input image and resize its foreground.
+    Args:
+        input_image: The uploaded image.
+        do_remove_background: When falsy, the image is returned untouched.
+    Returns:
+        The original image, or the result of `remove_background` followed by
+        `resize_foreground` with a ratio of 0.85.
+    """
+    if not do_remove_background:
+        return input_image
+    # A rembg session is only created when background removal is requested.
+    session = rembg.new_session()
+    foreground = remove_background(input_image, session)
+    return resize_foreground(foreground, 0.85)
+@spaces.GPU
+def generate_mvs(input_image, sample_steps, sample_seed):
+    """
+    Run the multi-view diffusion pipeline and build a preview of its output.
+    Args:
+        input_image: Conditioning image handed to the pipeline.
+        sample_steps: Passed to the pipeline as `num_inference_steps`.
+        sample_seed: Seed given to `seed_everything` before sampling.
+    Returns:
+        (z123_image, show_image): the first image returned by the pipeline, and a
+        PIL image in which the same tiles are regrouped from a 3x2 grid (rows x
+        columns) into a 2x3 grid for display.
+    """
+    seed_everything(sample_seed)
+    # sampling
+    result = pipeline(input_image, num_inference_steps=sample_steps)
+    z123_image = result.images[0]
+    grid = torch.from_numpy(np.asarray(z123_image, dtype=np.uint8))     # (960, 640, 3)
+    # Split the grid into individual tiles, then lay the tiles out again as 2 rows x 3 columns.
+    tiles = rearrange(grid, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
+    preview = rearrange(tiles, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
+    return z123_image, Image.fromarray(preview.numpy())
+@spaces.GPU
+def make3d(images):
+    """
+    Reconstruct a mesh from the multi-view image and export it as OBJ and GLB.
+    Args:
+        images: The multi-view grid image; it is split into 3 rows x 2 columns of
+            views before being fed to the reconstruction model.
+    Returns:
+        (mesh_fpath, mesh_glb_fpath): paths of the exported .obj and .glb files,
+        which share a base name in the temporary directory.
+    """
+    if IS_FLEXICUBES:
+        model.init_flexicubes_geometry(device, use_renderer=False)
+    # eval() switches the module in place, so the global does not need rebinding.
+    model.eval()
+    images = np.asarray(images, dtype=np.float32) / 255.0
+    images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()     # (3, 960, 640)
+    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)        # (6, 3, 320, 320)
+    input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
+    images = images.unsqueeze(0).to(device)
+    images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
+    # Reserve a unique .obj path. Closing the handle right away avoids leaking an
+    # open file descriptor on every request; delete=False keeps the file on disk.
+    with tempfile.NamedTemporaryFile(suffix=".obj", delete=False) as tmp_file:
+        mesh_fpath = tmp_file.name
+    print(mesh_fpath)
+    # The GLB export sits next to the OBJ file and shares its base name.
+    mesh_glb_fpath = os.path.splitext(mesh_fpath)[0] + ".glb"
+    # NOTE: no turntable video is rendered here, so no render cameras are built.
+    with torch.no_grad():
+        # get triplane
+        planes = model.forward_planes(images, input_cameras)
+        # get mesh
+        mesh_out = model.extract_mesh(
+            planes,
+            use_texture_map=False,
+            **infer_config,
+        )
+        vertices, faces, vertex_colors = mesh_out
+        # Reorder the vertex coordinate columns (x, y, z) -> (y, z, x) before export.
+        vertices = vertices[:, [1, 2, 0]]
+        save_glb(vertices, faces, vertex_colors, mesh_glb_fpath)
+        save_obj(vertices, faces, vertex_colors, mesh_fpath)
+        print(f"Mesh saved to {mesh_fpath}")
+    return mesh_fpath, mesh_glb_fpath
+# Markdown/HTML banner rendered at the top of the demo page via gr.Markdown(_HEADER_).
+_HEADER_ = '''
+<h2><b>Welcome to 3DFusion!</b></h2>
+<h2><a href='https://github.com/TencentARC/InstantMesh' target='_blank'><b>3D Mesh Generation from Single Images with 3DFusion</b></a></h2>
+3DFusion is a cutting-edge, efficient 3D mesh generation tool based on the powerful LRM/Instant3D architecture.
+Code and Original Framework: <a href='https://github.com/TencentARC/InstantMesh' target='_blank'>InstantMesh GitHub</a>. Technical report: <a href='https://arxiv.org/abs/2404.07191' target='_blank'>ArXiv</a>.
+❗️**Important Notes:**
+- This demo exports both `.obj` and `.glb` meshes, including vertex colors.
+- The 3D mesh generation depends on the quality of generated multi-view images, so try different seed values (default: 42) for optimal results.
+'''
+# Citation, license and contact footer rendered at the bottom of the page via
+# gr.Markdown(_CITE_). Declared as a raw string, so backslashes are kept literally.
+_CITE_ = r"""
+If you find **3DFusion** helpful, please give a ⭐ to the original <a href='https://github.com/TencentARC/InstantMesh' target='_blank'>InstantMesh repository</a>. We appreciate the work of the TencentARC team! [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/InstantMesh?style=social)](https://github.com/TencentARC/InstantMesh)
+---
+📝 **Citation**
+If you use this work for research or applications, cite it as follows:
+```bibtex
+@article{xu2024instantmesh,
+  title={InstantMesh: Efficient 3D Mesh Generation from a Single Image with Sparse-view Large Reconstruction Models},
+  author={Xu, Jiale and Cheng, Weihao and Gao, Yiming and Wang, Xintao and Gao, Shenghua and Shan, Ying},
+  journal={arXiv preprint arXiv:2404.07191},
+  year={2024}
+}
+```
+📋 **License**
+Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/spaces/TencentARC/InstantMesh/blob/main/LICENSE) for details.
+📧 **Contact**
+If you have any questions, feel free to open a discussion or contact us at <b>bluestyle928@gmail.com</b>.
+"""
+# Gradio UI: the left column holds the inputs and examples, the right column the
+# generated multi-view image and the two mesh viewers.
+with gr.Blocks() as demo:
+    gr.Markdown(_HEADER_)
+    with gr.Row(variant="panel"):
+        # Left column: image upload, sampling options, submit button, examples.
+        with gr.Column():
+            with gr.Row():
+                input_image = gr.Image(
+                    label="Input Image",
+                    image_mode="RGBA",
+                    sources="upload",
+                    #width=256,
+                    #height=256,
+                    type="pil",
+                    elem_id="content_image",
+                )
+                # Read-only view of the image returned by `preprocess`.
+                processed_image = gr.Image(
+                    label="Processed Image",
+                    image_mode="RGBA",
+                    #width=256,
+                    #height=256,
+                    type="pil",
+                    interactive=False
+                )
+            with gr.Row():
+                with gr.Group():
+                    do_remove_background = gr.Checkbox(
+                        label="Remove Background", value=True
+                    )
+                    sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
+                    sample_steps = gr.Slider(
+                        label="Sample Steps",
+                        minimum=30,
+                        maximum=75,
+                        value=75,
+                        step=5
+                    )
+            with gr.Row():
+                submit = gr.Button("Generate", elem_id="generate", variant="primary")
+            with gr.Row(variant="panel"):
+                # Every file in the local "examples" directory is offered as an
+                # example input, in sorted order.
+                gr.Examples(
+                    examples=[
+                        os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))
+                    ],
+                    inputs=[input_image],
+                    label="Examples",
+                    cache_examples=False,
+                    examples_per_page=16
+                )
+        # Right column: multi-view preview and the OBJ / GLB model viewers.
+        with gr.Column():
+            with gr.Row():
+                with gr.Column():
+                    mv_show_images = gr.Image(
+                        label="Generated Multi-views",
+                        type="pil",
+                        width=379,
+                        interactive=False
+                    )
+                # with gr.Column():
+                #     output_video = gr.Video(
+                #         label="video", format="mp4",
+                #         width=379,
+                #         autoplay=True,
+                #         interactive=False
+                #     )
+            with gr.Row():
+                with gr.Tab("OBJ"):
+                    output_model_obj = gr.Model3D(
+                        label="Output Model (OBJ Format)",
+                        interactive=False,
+                    )
+                    gr.Markdown("Note: Downloaded .obj model will be flipped. Export .glb instead or manually flip it before usage.")
+                with gr.Tab("GLB"):
+                    output_model_glb = gr.Model3D(
+                        label="Output Model (GLB Format)",
+                        interactive=False,
+                    )
+                    gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
+            with gr.Row():
+                gr.Markdown('''Try a different <b>seed value</b> if the result is unsatisfying (Default: 42).''')
+    gr.Markdown(_CITE_)
+    # Holds the raw multi-view image between `generate_mvs` and `make3d`.
+    mv_images = gr.State()
+    # Event chain: validate -> preprocess -> generate multi-views -> build mesh.
+    # Steps are linked with .success(), i.e. each is meant to run only after the
+    # previous one completed without raising.
+    submit.click(fn=check_input_image, inputs=[input_image]).success(
+        fn=preprocess,
+        inputs=[input_image, do_remove_background],
+        outputs=[processed_image],
+    ).success(
+        fn=generate_mvs,
+        inputs=[processed_image, sample_steps, sample_seed],
+        outputs=[mv_images, mv_show_images]
+    ).success(
+        fn=make3d,
+        inputs=[mv_images],
+        outputs=[output_model_obj, output_model_glb]
+    )
+demo.launch()

configs/instant-mesh-base.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+model_config:
+  target: src.models.lrm_mesh.InstantMesh
+  params:
+    encoder_feat_dim: 768
+    encoder_freeze: false
+    encoder_model_name: facebook/dino-vitb16
+    transformer_dim: 1024
+    transformer_layers: 12
+    transformer_heads: 16
+    triplane_low_res: 32
+    triplane_high_res: 64
+    triplane_dim: 40
+    rendering_samples_per_ray: 96
+    grid_res: 128
+    grid_scale: 2.1
+infer_config:
+  unet_path: ckpts/diffusion_pytorch_model.bin
+  model_path: ckpts/instant_mesh_base.ckpt
+  texture_resolution: 1024
+  render_resolution: 512

configs/instant-mesh-large.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+model_config:
+  target: src.models.lrm_mesh.InstantMesh
+  params:
+    encoder_feat_dim: 768
+    encoder_freeze: false
+    encoder_model_name: facebook/dino-vitb16
+    transformer_dim: 1024
+    transformer_layers: 16
+    transformer_heads: 16
+    triplane_low_res: 32
+    triplane_high_res: 64
+    triplane_dim: 80
+    rendering_samples_per_ray: 128
+    grid_res: 128
+    grid_scale: 2.1
+infer_config:
+  unet_path: ckpts/diffusion_pytorch_model.bin
+  model_path: ckpts/instant_mesh_large.ckpt
+  texture_resolution: 1024
+  render_resolution: 512

configs/instant-nerf-base.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+model_config:
+  target: src.models.lrm.InstantNeRF
+  params:
+    encoder_feat_dim: 768
+    encoder_freeze: false
+    encoder_model_name: facebook/dino-vitb16
+    transformer_dim: 1024
+    transformer_layers: 12
+    transformer_heads: 16
+    triplane_low_res: 32
+    triplane_high_res: 64
+    triplane_dim: 40
+    rendering_samples_per_ray: 96
+infer_config:
+  unet_path: ckpts/diffusion_pytorch_model.bin
+  model_path: ckpts/instant_nerf_base.ckpt
+  mesh_threshold: 10.0
+  mesh_resolution: 256
+  render_resolution: 384

configs/instant-nerf-large.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+model_config:
+  target: src.models.lrm.InstantNeRF
+  params:
+    encoder_feat_dim: 768
+    encoder_freeze: false
+    encoder_model_name: facebook/dino-vitb16
+    transformer_dim: 1024
+    transformer_layers: 16
+    transformer_heads: 16
+    triplane_low_res: 32
+    triplane_high_res: 64
+    triplane_dim: 80
+    rendering_samples_per_ray: 128
+infer_config:
+  unet_path: ckpts/diffusion_pytorch_model.bin
+  model_path: ckpts/instant_nerf_large.ckpt
+  mesh_threshold: 10.0
+  mesh_resolution: 256
+  render_resolution: 384

examples/bird.jpg ADDED Viewed

examples/bubble_mart_blue.png ADDED Viewed

examples/cake.jpg ADDED Viewed

examples/cartoon_dinosaur.png ADDED Viewed

examples/cartoon_panda.png ADDED Viewed

Git LFS Details

SHA256: c82fea6ac66b782b2aa1c6bd133447b5f54f688c7eb44998c4b00f190d47b2b7
Pointer size: 132 Bytes
Size of remote file: 1.52 MB

examples/chair_armed.png ADDED Viewed

examples/chair_comfort.jpg ADDED Viewed

examples/chair_wood.jpg ADDED Viewed

examples/chest.jpg ADDED Viewed

examples/cute_horse.jpg ADDED Viewed

examples/cute_tiger.jpg ADDED Viewed

examples/earphone.jpg ADDED Viewed

examples/fox.jpg ADDED Viewed

examples/fruit.jpg ADDED Viewed

examples/fruit_elephant.jpg ADDED Viewed

examples/genshin_building.png ADDED Viewed

examples/genshin_teapot.png ADDED Viewed

examples/hatsune_miku.png ADDED Viewed

examples/house2.jpg ADDED Viewed

examples/mushroom_teapot.jpg ADDED Viewed

examples/pikachu.png ADDED Viewed

examples/plant.jpg ADDED Viewed

examples/robot.jpg ADDED Viewed

examples/sea_turtle.png ADDED Viewed

examples/skating_shoe.jpg ADDED Viewed

examples/sorting_board.png ADDED Viewed

examples/sword.png ADDED Viewed

examples/toy_car.jpg ADDED Viewed

examples/watermelon.png ADDED Viewed

examples/whitedog.png ADDED Viewed

examples/x_teapot.jpg ADDED Viewed

examples/x_toyduck.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+torch==2.1.0
+torchvision==0.16.0
+torchaudio==2.1.0
+pytorch-lightning==2.1.2
+einops
+omegaconf
+deepspeed
+torchmetrics
+webdataset
+accelerate
+tensorboard
+PyMCubes
+trimesh
+rembg
+transformers==4.34.1
+diffusers==0.19.3
+bitsandbytes
+imageio[ffmpeg]
+xatlas
+plyfile
+xformers==0.0.22.post7
+git+https://github.com/NVlabs/nvdiffrast/
+huggingface-hub

src/__init__.py ADDED Viewed

File without changes

src/data/__init__.py ADDED Viewed

File without changes

src/data/objaverse.py ADDED Viewed

	@@ -0,0 +1,329 @@

+import os, sys
+import math
+import json
+import importlib
+from pathlib import Path
+import cv2
+import random
+import numpy as np
+from PIL import Image
+import webdataset as wds
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler
+from torchvision import transforms
+from src.utils.train_util import instantiate_from_config
+from src.utils.camera_util import (
+    FOV_to_intrinsics,
+    center_looking_at_camera_pose,
+    get_surrounding_views,
+)
+class DataModuleFromConfig(pl.LightningDataModule):
+    def __init__(
+        self,
+        batch_size=8,
+        num_workers=4,
+        train=None,
+        validation=None,
+        test=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.dataset_configs = dict()
+        if train is not None:
+            self.dataset_configs['train'] = train
+        if validation is not None:
+            self.dataset_configs['validation'] = validation
+        if test is not None:
+            self.dataset_configs['test'] = test
+    def setup(self, stage):
+        if stage in ['fit']:
+            self.datasets = dict((k, instantiate_from_config(self.dataset_configs[k])) for k in self.dataset_configs)
+        else:
+            raise NotImplementedError
+    def train_dataloader(self):
+        sampler = DistributedSampler(self.datasets['train'])
+        return wds.WebLoader(self.datasets['train'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)
+    def val_dataloader(self):
+        sampler = DistributedSampler(self.datasets['validation'])
+        return wds.WebLoader(self.datasets['validation'], batch_size=1, num_workers=self.num_workers, shuffle=False, sampler=sampler)
+    def test_dataloader(self):
+        return wds.WebLoader(self.datasets['test'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
+class ObjaverseData(Dataset):
+    def __init__(self,
+        root_dir='objaverse/',
+        meta_fname='valid_paths.json',
+        input_image_dir='rendering_random_32views',
+        target_image_dir='rendering_random_32views',
+        input_view_num=6,
+        target_view_num=2,
+        total_view_n=32,
+        fov=50,
+        camera_rotation=True,
+        validation=False,
+    ):
+        self.root_dir = Path(root_dir)
+        self.input_image_dir = input_image_dir
+        self.target_image_dir = target_image_dir
+        self.input_view_num = input_view_num
+        self.target_view_num = target_view_num
+        self.total_view_n = total_view_n
+        self.fov = fov
+        self.camera_rotation = camera_rotation
+        with open(os.path.join(root_dir, meta_fname)) as f:
+            filtered_dict = json.load(f)
+        paths = filtered_dict['good_objs']
+        self.paths = paths
+        self.depth_scale = 4.0
+        total_objects = len(self.paths)
+        print('============= length of dataset %d =============' % len(self.paths))
+    def __len__(self):
+        return len(self.paths)
+    def load_im(self, path, color):
+        '''
+        replace background pixel with random color in rendering
+        '''
+        pil_img = Image.open(path)
+        image = np.asarray(pil_img, dtype=np.float32) / 255.
+        alpha = image[:, :, 3:]
+        image = image[:, :, :3] * alpha + color * (1 - alpha)
+        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
+        alpha = torch.from_numpy(alpha).permute(2, 0, 1).contiguous().float()
+        return image, alpha
+    def __getitem__(self, index):
+        # load data
+        while True:
+            input_image_path = os.path.join(self.root_dir, self.input_image_dir, self.paths[index])
+            target_image_path = os.path.join(self.root_dir, self.target_image_dir, self.paths[index])
+            indices = np.random.choice(range(self.total_view_n), self.input_view_num + self.target_view_num, replace=False)
+            input_indices = indices[:self.input_view_num]
+            target_indices = indices[self.input_view_num:]
+            '''background color, default: white'''
+            bg_white = [1., 1., 1.]
+            bg_black = [0., 0., 0.]
+            image_list = []
+            alpha_list = []
+            depth_list = []
+            normal_list = []
+            pose_list = []
+            try:
+                input_cameras = np.load(os.path.join(input_image_path, 'cameras.npz'))['cam_poses']
+                for idx in input_indices:
+                    image, alpha = self.load_im(os.path.join(input_image_path, '%03d.png' % idx), bg_white)
+                    normal, _ = self.load_im(os.path.join(input_image_path, '%03d_normal.png' % idx), bg_black)
+                    depth = cv2.imread(os.path.join(input_image_path, '%03d_depth.png' % idx), cv2.IMREAD_UNCHANGED) / 255.0 * self.depth_scale
+                    depth = torch.from_numpy(depth).unsqueeze(0)
+                    pose = input_cameras[idx]
+                    pose = np.concatenate([pose, np.array([[0, 0, 0, 1]])], axis=0)
+                    image_list.append(image)
+                    alpha_list.append(alpha)
+                    depth_list.append(depth)
+                    normal_list.append(normal)
+                    pose_list.append(pose)
+                target_cameras = np.load(os.path.join(target_image_path, 'cameras.npz'))['cam_poses']
+                for idx in target_indices:
+                    image, alpha = self.load_im(os.path.join(target_image_path, '%03d.png' % idx), bg_white)
+                    normal, _ = self.load_im(os.path.join(target_image_path, '%03d_normal.png' % idx), bg_black)
+                    depth = cv2.imread(os.path.join(target_image_path, '%03d_depth.png' % idx), cv2.IMREAD_UNCHANGED) / 255.0 * self.depth_scale
+                    depth = torch.from_numpy(depth).unsqueeze(0)
+                    pose = target_cameras[idx]
+                    pose = np.concatenate([pose, np.array([[0, 0, 0, 1]])], axis=0)
+                    image_list.append(image)
+                    alpha_list.append(alpha)
+                    depth_list.append(depth)
+                    normal_list.append(normal)
+                    pose_list.append(pose)
+            except Exception as e:
+                print(e)
+                index = np.random.randint(0, len(self.paths))
+                continue
+            break
+        images = torch.stack(image_list, dim=0).float()                 # (6+V, 3, H, W)
+        alphas = torch.stack(alpha_list, dim=0).float()                 # (6+V, 1, H, W)
+        depths = torch.stack(depth_list, dim=0).float()                 # (6+V, 1, H, W)
+        normals = torch.stack(normal_list, dim=0).float()               # (6+V, 3, H, W)
+        w2cs = torch.from_numpy(np.stack(pose_list, axis=0)).float()    # (6+V, 4, 4)
+        c2ws = torch.linalg.inv(w2cs).float()
+        normals = normals * 2.0 - 1.0
+        normals = F.normalize(normals, dim=1)
+        normals = (normals + 1.0) / 2.0
+        normals = torch.lerp(torch.zeros_like(normals), normals, alphas)
+        # random rotation along z axis
+        if self.camera_rotation:
+            degree = np.random.uniform(0, math.pi * 2)
+            rot = torch.tensor([
+                [np.cos(degree), -np.sin(degree), 0, 0],
+                [np.sin(degree), np.cos(degree), 0, 0],
+                [0, 0, 1, 0],
+                [0, 0, 0, 1],
+            ]).unsqueeze(0).float()
+            c2ws = torch.matmul(rot, c2ws)
+            # rotate normals
+            N, _, H, W = normals.shape
+            normals = normals * 2.0 - 1.0
+            normals = torch.matmul(rot[:, :3, :3], normals.view(N, 3, -1)).view(N, 3, H, W)
+            normals = F.normalize(normals, dim=1)
+            normals = (normals + 1.0) / 2.0
+            normals = torch.lerp(torch.zeros_like(normals), normals, alphas)
+        # random scaling
+        if np.random.rand() < 0.5:
+            scale = np.random.uniform(0.8, 1.0)
+            c2ws[:, :3, 3] *= scale
+            depths *= scale
+        # instrinsics of perspective cameras
+        K = FOV_to_intrinsics(self.fov)
+        Ks = K.unsqueeze(0).repeat(self.input_view_num + self.target_view_num, 1, 1).float()
+        data = {
+            'input_images': images[:self.input_view_num],     # (6, 3, H, W)
+            'input_alphas': alphas[:self.input_view_num],           # (6, 1, H, W)
+            'input_depths': depths[:self.input_view_num],           # (6, 1, H, W)
+            'input_normals': normals[:self.input_view_num],         # (6, 3, H, W)
+            'input_c2ws': c2ws_input[:self.input_view_num],         # (6, 4, 4)
+            'input_Ks': Ks[:self.input_view_num],                   # (6, 3, 3)
+            # lrm generator input and supervision
+            'target_images': images[self.input_view_num:],          # (V, 3, H, W)
+            'target_alphas': alphas[self.input_view_num:],          # (V, 1, H, W)
+            'target_depths': depths[self.input_view_num:],          # (V, 1, H, W)
+            'target_normals': normals[self.input_view_num:],        # (V, 3, H, W)
+            'target_c2ws': c2ws[self.input_view_num:],              # (V, 4, 4)
+            'target_Ks': Ks[self.input_view_num:],                  # (V, 3, 3)
+            'depth_available': 1,
+        }
+        return data
+class ValidationData(Dataset):
+    def __init__(self,
+        root_dir='objaverse/',
+        input_view_num=6,
+        input_image_size=256,
+        fov=50,
+    ):
+        self.root_dir = Path(root_dir)
+        self.input_view_num = input_view_num
+        self.input_image_size = input_image_size
+        self.fov = fov
+        self.paths = sorted(os.listdir(self.root_dir))
+        print('============= length of dataset %d =============' % len(self.paths))
+        cam_distance = 2.5
+        azimuths = np.array([30, 90, 150, 210, 270, 330])
+        elevations = np.array([30, -20, 30, -20, 30, -20])
+        azimuths = np.deg2rad(azimuths)
+        elevations = np.deg2rad(elevations)
+        x = cam_distance * np.cos(elevations) * np.cos(azimuths)
+        y = cam_distance * np.cos(elevations) * np.sin(azimuths)
+        z = cam_distance * np.sin(elevations)
+        cam_locations = np.stack([x, y, z], axis=-1)
+        cam_locations = torch.from_numpy(cam_locations).float()
+        c2ws = center_looking_at_camera_pose(cam_locations)
+        self.c2ws = c2ws.float()
+        self.Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(6, 1, 1).float()
+        render_c2ws = get_surrounding_views(M=8, radius=cam_distance)
+        render_Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(render_c2ws.shape[0], 1, 1)
+        self.render_c2ws = render_c2ws.float()
+        self.render_Ks = render_Ks.float()
+    def __len__(self):
+        return len(self.paths)
+    def load_im(self, path, color):
+        '''
+        replace background pixel with random color in rendering
+        '''
+        pil_img = Image.open(path)
+        pil_img = pil_img.resize((self.input_image_size, self.input_image_size), resample=Image.BICUBIC)
+        image = np.asarray(pil_img, dtype=np.float32) / 255.
+        if image.shape[-1] == 4:
+            alpha = image[:, :, 3:]
+            image = image[:, :, :3] * alpha + color * (1 - alpha)
+        else:
+            alpha = np.ones_like(image[:, :, :1])
+        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
+        alpha = torch.from_numpy(alpha).permute(2, 0, 1).contiguous().float()
+        return image, alpha
+    def __getitem__(self, index):
+        # load data
+        input_image_path = os.path.join(self.root_dir, self.paths[index])
+        '''background color, default: white'''
+        # color = np.random.uniform(0.48, 0.52)
+        bkg_color = [1.0, 1.0, 1.0]
+        image_list = []
+        alpha_list = []
+        for idx in range(self.input_view_num):
+            image, alpha = self.load_im(os.path.join(input_image_path, f'{idx:03d}.png'), bkg_color)
+            image_list.append(image)
+            alpha_list.append(alpha)
+        images = torch.stack(image_list, dim=0).float()                     # (6+V, 3, H, W)
+        alphas = torch.stack(alpha_list, dim=0).float()                 # (6+V, 1, H, W)
+        data = {
+            'input_images': images,                 # (6, 3, H, W)
+            'input_alphas': alphas,             # (6, 1, H, W)
+            'input_c2ws': self.c2ws,            # (6, 4, 4)
+            'input_Ks': self.Ks,                # (6, 3, 3)
+            'render_c2ws': self.render_c2ws,
+            'render_Ks': self.render_Ks,
+        }
+        return data

src/model.py ADDED Viewed

	@@ -0,0 +1,310 @@

+import os
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import v2
+from torchvision.utils import make_grid, save_image
+from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
+import pytorch_lightning as pl
+from einops import rearrange, repeat
+from src.utils.train_util import instantiate_from_config
+class MVRecon(pl.LightningModule):
+    def __init__(
+        self,
+        lrm_generator_config,
+        lrm_path=None,
+        input_size=256,
+        render_size=192,
+    ):
+        super(MVRecon, self).__init__()
+        self.input_size = input_size
+        self.render_size = render_size
+        # init modules
+        self.lrm_generator = instantiate_from_config(lrm_generator_config)
+        if lrm_path is not None:
+            lrm_ckpt = torch.load(lrm_path)
+            self.lrm_generator.load_state_dict(lrm_ckpt['weights'], strict=False)
+        self.lpips = LearnedPerceptualImagePatchSimilarity(net_type='vgg')
+        self.validation_step_outputs = []
+    def on_fit_start(self):
+        if self.global_rank == 0:
+            os.makedirs(os.path.join(self.logdir, 'images'), exist_ok=True)
+            os.makedirs(os.path.join(self.logdir, 'images_val'), exist_ok=True)
+    def prepare_batch_data(self, batch):
+        lrm_generator_input = {}
+        render_gt = {}   # for supervision
+        # input images
+        images = batch['input_images']
+        images = v2.functional.resize(
+            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)
+        lrm_generator_input['images'] = images.to(self.device)
+        # input cameras and render cameras
+        input_c2ws = batch['input_c2ws'].flatten(-2)
+        input_Ks = batch['input_Ks'].flatten(-2)
+        target_c2ws = batch['target_c2ws'].flatten(-2)
+        target_Ks = batch['target_Ks'].flatten(-2)
+        render_cameras_input = torch.cat([input_c2ws, input_Ks], dim=-1)
+        render_cameras_target = torch.cat([target_c2ws, target_Ks], dim=-1)
+        render_cameras = torch.cat([render_cameras_input, render_cameras_target], dim=1)
+        input_extrinsics = input_c2ws[:, :, :12]
+        input_intrinsics = torch.stack([
+            input_Ks[:, :, 0], input_Ks[:, :, 4],
+            input_Ks[:, :, 2], input_Ks[:, :, 5],
+        ], dim=-1)
+        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)
+        # add noise to input cameras
+        cameras = cameras + torch.rand_like(cameras) * 0.04 - 0.02
+        lrm_generator_input['cameras'] = cameras.to(self.device)
+        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)
+        # target images
+        target_images = torch.cat([batch['input_images'], batch['target_images']], dim=1)
+        target_depths = torch.cat([batch['input_depths'], batch['target_depths']], dim=1)
+        target_alphas = torch.cat([batch['input_alphas'], batch['target_alphas']], dim=1)
+        # random crop
+        render_size = np.random.randint(self.render_size, 513)
+        target_images = v2.functional.resize(
+            target_images, render_size, interpolation=3, antialias=True).clamp(0, 1)
+        target_depths = v2.functional.resize(
+            target_depths, render_size, interpolation=0, antialias=True)
+        target_alphas = v2.functional.resize(
+            target_alphas, render_size, interpolation=0, antialias=True)
+        crop_params = v2.RandomCrop.get_params(
+            target_images, output_size=(self.render_size, self.render_size))
+        target_images = v2.functional.crop(target_images, *crop_params)
+        target_depths = v2.functional.crop(target_depths, *crop_params)[:, :, 0:1]
+        target_alphas = v2.functional.crop(target_alphas, *crop_params)[:, :, 0:1]
+        lrm_generator_input['render_size'] = render_size
+        lrm_generator_input['crop_params'] = crop_params
+        render_gt['target_images'] = target_images.to(self.device)
+        render_gt['target_depths'] = target_depths.to(self.device)
+        render_gt['target_alphas'] = target_alphas.to(self.device)
+        return lrm_generator_input, render_gt
+    def prepare_validation_batch_data(self, batch):
+        lrm_generator_input = {}
+        # input images
+        images = batch['input_images']
+        images = v2.functional.resize(
+            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)
+        lrm_generator_input['images'] = images.to(self.device)
+        input_c2ws = batch['input_c2ws'].flatten(-2)
+        input_Ks = batch['input_Ks'].flatten(-2)
+        input_extrinsics = input_c2ws[:, :, :12]
+        input_intrinsics = torch.stack([
+            input_Ks[:, :, 0], input_Ks[:, :, 4],
+            input_Ks[:, :, 2], input_Ks[:, :, 5],
+        ], dim=-1)
+        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)
+        lrm_generator_input['cameras'] = cameras.to(self.device)
+        render_c2ws = batch['render_c2ws'].flatten(-2)
+        render_Ks = batch['render_Ks'].flatten(-2)
+        render_cameras = torch.cat([render_c2ws, render_Ks], dim=-1)
+        lrm_generator_input['render_cameras'] = render_cameras.to(self.device)
+        lrm_generator_input['render_size'] = 384
+        lrm_generator_input['crop_params'] = None
+        return lrm_generator_input
+    def forward_lrm_generator(
+        self,
+        images,
+        cameras,
+        render_cameras,
+        render_size=192,
+        crop_params=None,
+        chunk_size=1,
+    ):
+        planes = torch.utils.checkpoint.checkpoint(
+            self.lrm_generator.forward_planes,
+            images,
+            cameras,
+            use_reentrant=False,
+        )
+        frames = []
+        for i in range(0, render_cameras.shape[1], chunk_size):
+            frames.append(
+                torch.utils.checkpoint.checkpoint(
+                    self.lrm_generator.synthesizer,
+                    planes,
+                    cameras=render_cameras[:, i:i+chunk_size],
+                    render_size=render_size,
+                    crop_params=crop_params,
+                    use_reentrant=False
+                )
+            )
+        frames = {
+            k: torch.cat([r[k] for r in frames], dim=1)
+            for k in frames[0].keys()
+        }
+        return frames
+    def forward(self, lrm_generator_input):
+        images = lrm_generator_input['images']
+        cameras = lrm_generator_input['cameras']
+        render_cameras = lrm_generator_input['render_cameras']
+        render_size = lrm_generator_input['render_size']
+        crop_params = lrm_generator_input['crop_params']
+        out = self.forward_lrm_generator(
+            images,
+            cameras,
+            render_cameras,
+            render_size=render_size,
+            crop_params=crop_params,
+            chunk_size=1,
+        )
+        render_images = torch.clamp(out['images_rgb'], 0.0, 1.0)
+        render_depths = out['images_depth']
+        render_alphas = torch.clamp(out['images_weight'], 0.0, 1.0)
+        out = {
+            'render_images': render_images,
+            'render_depths': render_depths,
+            'render_alphas': render_alphas,
+        }
+        return out
+    def training_step(self, batch, batch_idx):
+        lrm_generator_input, render_gt = self.prepare_batch_data(batch)
+        render_out = self.forward(lrm_generator_input)
+        loss, loss_dict = self.compute_loss(render_out, render_gt)
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        if self.global_step % 1000 == 0 and self.global_rank == 0:
+            B, N, C, H, W = render_gt['target_images'].shape
+            N_in = lrm_generator_input['images'].shape[1]
+            input_images = v2.functional.resize(
+                lrm_generator_input['images'], (H, W), interpolation=3, antialias=True).clamp(0, 1)
+            input_images = torch.cat(
+                [input_images, torch.ones(B, N-N_in, C, H, W).to(input_images)], dim=1)
+            input_images = rearrange(
+                input_images, 'b n c h w -> b c h (n w)')
+            target_images = rearrange(
+                render_gt['target_images'], 'b n c h w -> b c h (n w)')
+            render_images = rearrange(
+                render_out['render_images'], 'b n c h w -> b c h (n w)')
+            target_alphas = rearrange(
+                repeat(render_gt['target_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            render_alphas = rearrange(
+                repeat(render_out['render_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            target_depths = rearrange(
+                repeat(render_gt['target_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            render_depths = rearrange(
+                repeat(render_out['render_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            MAX_DEPTH = torch.max(target_depths)
+            target_depths = target_depths / MAX_DEPTH * target_alphas
+            render_depths = render_depths / MAX_DEPTH
+            grid = torch.cat([
+                input_images,
+                target_images, render_images,
+                target_alphas, render_alphas,
+                target_depths, render_depths,
+            ], dim=-2)
+            grid = make_grid(grid, nrow=target_images.shape[0], normalize=True, value_range=(0, 1))
+            save_image(grid, os.path.join(self.logdir, 'images', f'train_{self.global_step:07d}.png'))
+        return loss
+    def compute_loss(self, render_out, render_gt):
+        # NOTE: the rgb value range of OpenLRM is [0, 1]
+        render_images = render_out['render_images']
+        target_images = render_gt['target_images'].to(render_images)
+        render_images = rearrange(render_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
+        target_images = rearrange(target_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
+        loss_mse = F.mse_loss(render_images, target_images)
+        loss_lpips = 2.0 * self.lpips(render_images, target_images)
+        render_alphas = render_out['render_alphas']
+        target_alphas = render_gt['target_alphas']
+        loss_mask = F.mse_loss(render_alphas, target_alphas)
+        loss = loss_mse + loss_lpips + loss_mask
+        prefix = 'train'
+        loss_dict = {}
+        loss_dict.update({f'{prefix}/loss_mse': loss_mse})
+        loss_dict.update({f'{prefix}/loss_lpips': loss_lpips})
+        loss_dict.update({f'{prefix}/loss_mask': loss_mask})
+        loss_dict.update({f'{prefix}/loss': loss})
+        return loss, loss_dict
+    @torch.no_grad()
+    def validation_step(self, batch, batch_idx):
+        lrm_generator_input = self.prepare_validation_batch_data(batch)
+        render_out = self.forward(lrm_generator_input)
+        render_images = render_out['render_images']
+        render_images = rearrange(render_images, 'b n c h w -> b c h (n w)')
+        self.validation_step_outputs.append(render_images)
+    def on_validation_epoch_end(self):
+        images = torch.cat(self.validation_step_outputs, dim=-1)
+        all_images = self.all_gather(images)
+        all_images = rearrange(all_images, 'r b c h w -> (r b) c h w')
+        if self.global_rank == 0:
+            image_path = os.path.join(self.logdir, 'images_val', f'val_{self.global_step:07d}.png')
+            grid = make_grid(all_images, nrow=1, normalize=True, value_range=(0, 1))
+            save_image(grid, image_path)
+            print(f"Saved image to {image_path}")
+        self.validation_step_outputs.clear()
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        params = []
+        lrm_params_fast, lrm_params_slow = [], []
+        for n, p in self.lrm_generator.named_parameters():
+            if 'adaLN_modulation' in n or 'camera_embedder' in n:
+                lrm_params_fast.append(p)
+            else:
+                lrm_params_slow.append(p)
+        params.append({"params": lrm_params_fast, "lr": lr, "weight_decay": 0.01 })
+        params.append({"params": lrm_params_slow, "lr": lr / 10.0, "weight_decay": 0.01 })
+        optimizer = torch.optim.AdamW(params, lr=lr, betas=(0.90, 0.95))
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr/4)
+        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

src/model_mesh.py ADDED Viewed

	@@ -0,0 +1,325 @@

+import os
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import v2
+from torchvision.utils import make_grid, save_image
+from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
+import pytorch_lightning as pl
+from einops import rearrange, repeat
+from src.utils.train_util import instantiate_from_config
+# Regularization loss for FlexiCubes
+def sdf_reg_loss_batch(sdf, all_edges):
+    sdf_f1x6x2 = sdf[:, all_edges.reshape(-1)].reshape(sdf.shape[0], -1, 2)
+    mask = torch.sign(sdf_f1x6x2[..., 0]) != torch.sign(sdf_f1x6x2[..., 1])
+    sdf_f1x6x2 = sdf_f1x6x2[mask]
+    sdf_diff = F.binary_cross_entropy_with_logits(
+        sdf_f1x6x2[..., 0], (sdf_f1x6x2[..., 1] > 0).float()) + \
+               F.binary_cross_entropy_with_logits(
+                   sdf_f1x6x2[..., 1], (sdf_f1x6x2[..., 0] > 0).float())
+    return sdf_diff
+class MVRecon(pl.LightningModule):
+    def __init__(
+        self,
+        lrm_generator_config,
+        input_size=256,
+        render_size=512,
+        init_ckpt=None,
+    ):
+        """Build the generator and optionally initialize it from a checkpoint.
+        Args:
+            lrm_generator_config: config handed to `instantiate_from_config`.
+            input_size: size the input views are resized to.
+            render_size: size the supervision targets are resized to.
+            init_ckpt: optional checkpoint path; its 'state_dict' entries that
+                start with 'lrm_generator' are remapped (see below) and loaded
+                non-strictly.
+        """
+        super(MVRecon, self).__init__()
+        self.input_size = input_size
+        self.render_size = render_size
+        # init modules
+        self.lrm_generator = instantiate_from_config(lrm_generator_config)
+        self.lpips = LearnedPerceptualImagePatchSimilarity(net_type='vgg')
+        # Load weights from pretrained MVRecon model, and use the mlp
+        # weights to initialize the weights of sdf and rgb mlps.
+        if init_ckpt is not None:
+            sd = torch.load(init_ckpt, map_location='cpu')['state_dict']
+            sd = {k: v for k, v in sd.items() if k.startswith('lrm_generator')}
+            sd_fc = {}
+            for k, v in sd.items():
+                if k.startswith('lrm_generator.synthesizer.decoder.net.'):
+                    if k.startswith('lrm_generator.synthesizer.decoder.net.6.'):    # last layer
+                        # Here we assume the density field's isosurface threshold is t,
+                        # we reverse the sign of density field to initialize SDF field.
+                        # -(w*x + b - t) = (-w)*x + (t - b)
+                        # The code below uses t = 3.0. Row 0 of the last layer
+                        # feeds the SDF head, rows 1..3 feed the RGB head.
+                        if 'weight' in k:
+                            sd_fc[k.replace('net.', 'net_sdf.')] = -v[0:1]
+                        else:
+                            sd_fc[k.replace('net.', 'net_sdf.')] = 3.0 - v[0:1]
+                        sd_fc[k.replace('net.', 'net_rgb.')] = v[1:4]
+                    else:
+                        # earlier layers: both heads start from the same tensor
+                        sd_fc[k.replace('net.', 'net_sdf.')] = v
+                        sd_fc[k.replace('net.', 'net_rgb.')] = v
+                else:
+                    sd_fc[k] = v
+            # strip the wrapper prefix so keys match the generator's own state dict
+            sd_fc = {k.replace('lrm_generator.', ''): v for k, v in sd_fc.items()}
+            # missing `net_deformation` and `net_weight` parameters
+            self.lrm_generator.load_state_dict(sd_fc, strict=False)
+            print(f'Loaded weights from {init_ckpt}')
+        self.validation_step_outputs = []
+    def on_fit_start(self):
+        device = torch.device(f'cuda:{self.global_rank}')
+        self.lrm_generator.init_flexicubes_geometry(device)
+        if self.global_rank == 0:
+            os.makedirs(os.path.join(self.logdir, 'images'), exist_ok=True)
+            os.makedirs(os.path.join(self.logdir, 'images_val'), exist_ok=True)
+    def prepare_batch_data(self, batch):
+        lrm_generator_input = {}
+        render_gt = {}
+        # input images
+        images = batch['input_images']
+        images = v2.functional.resize(
+            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)
+        lrm_generator_input['images'] = images.to(self.device)
+        # input cameras and render cameras
+        input_c2ws = batch['input_c2ws']
+        input_Ks = batch['input_Ks']
+        target_c2ws = batch['target_c2ws']
+        render_c2ws = torch.cat([input_c2ws, target_c2ws], dim=1)
+        render_w2cs = torch.linalg.inv(render_c2ws)
+        input_extrinsics = input_c2ws.flatten(-2)
+        input_extrinsics = input_extrinsics[:, :, :12]
+        input_intrinsics = input_Ks.flatten(-2)
+        input_intrinsics = torch.stack([
+            input_intrinsics[:, :, 0], input_intrinsics[:, :, 4],
+            input_intrinsics[:, :, 2], input_intrinsics[:, :, 5],
+        ], dim=-1)
+        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)
+        # add noise to input_cameras
+        cameras = cameras + torch.rand_like(cameras) * 0.04 - 0.02
+        lrm_generator_input['cameras'] = cameras.to(self.device)
+        lrm_generator_input['render_cameras'] = render_w2cs.to(self.device)
+        # target images
+        target_images = torch.cat([batch['input_images'], batch['target_images']], dim=1)
+        target_depths = torch.cat([batch['input_depths'], batch['target_depths']], dim=1)
+        target_alphas = torch.cat([batch['input_alphas'], batch['target_alphas']], dim=1)
+        target_normals = torch.cat([batch['input_normals'], batch['target_normals']], dim=1)
+        render_size = self.render_size
+        target_images = v2.functional.resize(
+            target_images, render_size, interpolation=3, antialias=True).clamp(0, 1)
+        target_depths = v2.functional.resize(
+            target_depths, render_size, interpolation=0, antialias=True)
+        target_alphas = v2.functional.resize(
+            target_alphas, render_size, interpolation=0, antialias=True)
+        target_normals = v2.functional.resize(
+            target_normals, render_size, interpolation=3, antialias=True)
+        lrm_generator_input['render_size'] = render_size
+        render_gt['target_images'] = target_images.to(self.device)
+        render_gt['target_depths'] = target_depths.to(self.device)
+        render_gt['target_alphas'] = target_alphas.to(self.device)
+        render_gt['target_normals'] = target_normals.to(self.device)
+        return lrm_generator_input, render_gt
+    def prepare_validation_batch_data(self, batch):
+        lrm_generator_input = {}
+        # input images
+        images = batch['input_images']
+        images = v2.functional.resize(
+            images, self.input_size, interpolation=3, antialias=True).clamp(0, 1)
+        lrm_generator_input['images'] = images.to(self.device)
+        # input cameras
+        input_c2ws = batch['input_c2ws'].flatten(-2)
+        input_Ks = batch['input_Ks'].flatten(-2)
+        input_extrinsics = input_c2ws[:, :, :12]
+        input_intrinsics = torch.stack([
+            input_Ks[:, :, 0], input_Ks[:, :, 4],
+            input_Ks[:, :, 2], input_Ks[:, :, 5],
+        ], dim=-1)
+        cameras = torch.cat([input_extrinsics, input_intrinsics], dim=-1)
+        lrm_generator_input['cameras'] = cameras.to(self.device)
+        # render cameras
+        render_c2ws = batch['render_c2ws']
+        render_w2cs = torch.linalg.inv(render_c2ws)
+        lrm_generator_input['render_cameras'] = render_w2cs.to(self.device)
+        lrm_generator_input['render_size'] = 384
+        return lrm_generator_input
+    def forward_lrm_generator(self, images, cameras, render_cameras, render_size=512):
+        planes = torch.utils.checkpoint.checkpoint(
+            self.lrm_generator.forward_planes,
+            images,
+            cameras,
+            use_reentrant=False,
+        )
+        out = self.lrm_generator.forward_geometry(
+            planes,
+            render_cameras,
+            render_size,
+        )
+        return out
+    def forward(self, lrm_generator_input):
+        images = lrm_generator_input['images']
+        cameras = lrm_generator_input['cameras']
+        render_cameras = lrm_generator_input['render_cameras']
+        render_size = lrm_generator_input['render_size']
+        out = self.forward_lrm_generator(
+            images, cameras, render_cameras, render_size=render_size)
+        return out
+    def training_step(self, batch, batch_idx):
+        lrm_generator_input, render_gt = self.prepare_batch_data(batch)
+        render_out = self.forward(lrm_generator_input)
+        loss, loss_dict = self.compute_loss(render_out, render_gt)
+        self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+        if self.global_step % 1000 == 0 and self.global_rank == 0:
+            B, N, C, H, W = render_gt['target_images'].shape
+            N_in = lrm_generator_input['images'].shape[1]
+            target_images = rearrange(
+                render_gt['target_images'], 'b n c h w -> b c h (n w)')
+            render_images = rearrange(
+                render_out['img'], 'b n c h w -> b c h (n w)')
+            target_alphas = rearrange(
+                repeat(render_gt['target_alphas'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            render_alphas = rearrange(
+                repeat(render_out['mask'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            target_depths = rearrange(
+                repeat(render_gt['target_depths'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            render_depths = rearrange(
+                repeat(render_out['depth'], 'b n 1 h w -> b n 3 h w'), 'b n c h w -> b c h (n w)')
+            target_normals = rearrange(
+                render_gt['target_normals'], 'b n c h w -> b c h (n w)')
+            render_normals = rearrange(
+                render_out['normal'], 'b n c h w -> b c h (n w)')
+            MAX_DEPTH = torch.max(target_depths)
+            target_depths = target_depths / MAX_DEPTH * target_alphas
+            render_depths = render_depths / MAX_DEPTH
+            grid = torch.cat([
+                target_images, render_images,
+                target_alphas, render_alphas,
+                target_depths, render_depths,
+                target_normals, render_normals,
+            ], dim=-2)
+            grid = make_grid(grid, nrow=target_images.shape[0], normalize=True, value_range=(0, 1))
+            image_path = os.path.join(self.logdir, 'images', f'train_{self.global_step:07d}.png')
+            save_image(grid, image_path)
+            print(f"Saved image to {image_path}")
+        return loss
+    def compute_loss(self, render_out, render_gt):
+        # NOTE: the rgb value range of OpenLRM is [0, 1]
+        render_images = render_out['img']
+        target_images = render_gt['target_images'].to(render_images)
+        render_images = rearrange(render_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
+        target_images = rearrange(target_images, 'b n ... -> (b n) ...') * 2.0 - 1.0
+        loss_mse = F.mse_loss(render_images, target_images)
+        loss_lpips = 2.0 * self.lpips(render_images, target_images)
+        render_alphas = render_out['mask']
+        target_alphas = render_gt['target_alphas']
+        loss_mask = F.mse_loss(render_alphas, target_alphas)
+        render_depths = render_out['depth']
+        target_depths = render_gt['target_depths']
+        loss_depth = 0.5 * F.l1_loss(render_depths[target_alphas>0], target_depths[target_alphas>0])
+        render_normals = render_out['normal'] * 2.0 - 1.0
+        target_normals = render_gt['target_normals'] * 2.0 - 1.0
+        similarity = (render_normals * target_normals).sum(dim=-3).abs()
+        normal_mask = target_alphas.squeeze(-3)
+        loss_normal = 1 - similarity[normal_mask>0].mean()
+        loss_normal = 0.2 * loss_normal
+        # flexicubes regularization loss
+        sdf = render_out['sdf']
+        sdf_reg_loss = render_out['sdf_reg_loss']
+        sdf_reg_loss_entropy = sdf_reg_loss_batch(sdf, self.lrm_generator.geometry.all_edges).mean() * 0.01
+        _, flexicubes_surface_reg, flexicubes_weights_reg = sdf_reg_loss
+        flexicubes_surface_reg = flexicubes_surface_reg.mean() * 0.5
+        flexicubes_weights_reg = flexicubes_weights_reg.mean() * 0.1
+        loss_reg = sdf_reg_loss_entropy + flexicubes_surface_reg + flexicubes_weights_reg
+        loss = loss_mse + loss_lpips + loss_mask + loss_normal + loss_reg
+        prefix = 'train'
+        loss_dict = {}
+        loss_dict.update({f'{prefix}/loss_mse': loss_mse})
+        loss_dict.update({f'{prefix}/loss_lpips': loss_lpips})
+        loss_dict.update({f'{prefix}/loss_mask': loss_mask})
+        loss_dict.update({f'{prefix}/loss_normal': loss_normal})
+        loss_dict.update({f'{prefix}/loss_depth': loss_depth})
+        loss_dict.update({f'{prefix}/loss_reg_sdf': sdf_reg_loss_entropy})
+        loss_dict.update({f'{prefix}/loss_reg_surface': flexicubes_surface_reg})
+        loss_dict.update({f'{prefix}/loss_reg_weights': flexicubes_weights_reg})
+        loss_dict.update({f'{prefix}/loss': loss})
+        return loss, loss_dict
+    @torch.no_grad()
+    def validation_step(self, batch, batch_idx):
+        lrm_generator_input = self.prepare_validation_batch_data(batch)
+        render_out = self.forward(lrm_generator_input)
+        render_images = render_out['img']
+        render_images = rearrange(render_images, 'b n c h w -> b c h (n w)')
+        self.validation_step_outputs.append(render_images)
+    def on_validation_epoch_end(self):
+        images = torch.cat(self.validation_step_outputs, dim=-1)
+        all_images = self.all_gather(images)
+        all_images = rearrange(all_images, 'r b c h w -> (r b) c h w')
+        if self.global_rank == 0:
+            image_path = os.path.join(self.logdir, 'images_val', f'val_{self.global_step:07d}.png')
+            grid = make_grid(all_images, nrow=1, normalize=True, value_range=(0, 1))
+            save_image(grid, image_path)
+            print(f"Saved image to {image_path}")
+        self.validation_step_outputs.clear()
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        optimizer = torch.optim.AdamW(
+            self.lrm_generator.parameters(), lr=lr, betas=(0.90, 0.95), weight_decay=0.01)
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 100000, eta_min=0)
+        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

src/models/__init__.py ADDED Viewed

File without changes

src/models/decoder/__init__.py ADDED Viewed

File without changes

src/models/decoder/transformer.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# Copyright (c) 2023, Zexin He
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+class BasicTransformerBlock(nn.Module):
+    """
+    Transformer block that takes in a cross-attention condition and another modulation vector applied to sub-blocks.
+    """
+    # use attention from torch.nn.MultiHeadAttention
+    # Block contains a cross-attention layer, a self-attention layer, and a MLP
+    def __init__(
+        self,
+        inner_dim: int,
+        cond_dim: int,
+        num_heads: int,
+        eps: float,
+        attn_drop: float = 0.,
+        attn_bias: bool = False,
+        mlp_ratio: float = 4.,
+        mlp_drop: float = 0.,
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(inner_dim)
+        self.cross_attn = nn.MultiheadAttention(
+            embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
+            dropout=attn_drop, bias=attn_bias, batch_first=True)
+        self.norm2 = nn.LayerNorm(inner_dim)
+        self.self_attn = nn.MultiheadAttention(
+            embed_dim=inner_dim, num_heads=num_heads,
+            dropout=attn_drop, bias=attn_bias, batch_first=True)
+        self.norm3 = nn.LayerNorm(inner_dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
+            nn.GELU(),
+            nn.Dropout(mlp_drop),
+            nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
+            nn.Dropout(mlp_drop),
+        )
+    def forward(self, x, cond):
+        # x: [N, L, D]
+        # cond: [N, L_cond, D_cond]
+        x = x + self.cross_attn(self.norm1(x), cond, cond)[0]
+        before_sa = self.norm2(x)
+        x = x + self.self_attn(before_sa, before_sa, before_sa)[0]
+        x = x + self.mlp(self.norm3(x))
+        return x
+class TriplaneTransformer(nn.Module):
+    """
+    Transformer with condition that generates a triplane representation.
+    Reference:
+    Timm: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L486
+    """
+    def __init__(
+        self,
+        inner_dim: int,
+        image_feat_dim: int,
+        triplane_low_res: int,
+        triplane_high_res: int,
+        triplane_dim: int,
+        num_layers: int,
+        num_heads: int,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        # attributes
+        self.triplane_low_res = triplane_low_res
+        self.triplane_high_res = triplane_high_res
+        self.triplane_dim = triplane_dim
+        # modules
+        # initialize pos_embed with 1/sqrt(dim) * N(0, 1)
+        self.pos_embed = nn.Parameter(torch.randn(1, 3*triplane_low_res**2, inner_dim) * (1. / inner_dim) ** 0.5)
+        self.layers = nn.ModuleList([
+            BasicTransformerBlock(
+                inner_dim=inner_dim, cond_dim=image_feat_dim, num_heads=num_heads, eps=eps)
+            for _ in range(num_layers)
+        ])
+        self.norm = nn.LayerNorm(inner_dim, eps=eps)
+        self.deconv = nn.ConvTranspose2d(inner_dim, triplane_dim, kernel_size=2, stride=2, padding=0)
+    def forward(self, image_feats):
+        # image_feats: [N, L_cond, D_cond]
+        N = image_feats.shape[0]
+        H = W = self.triplane_low_res
+        L = 3 * H * W
+        x = self.pos_embed.repeat(N, 1, 1)  # [N, L, D]
+        for layer in self.layers:
+            x = layer(x, image_feats)
+        x = self.norm(x)
+        # separate each plane and apply deconv
+        x = x.view(N, 3, H, W, -1)
+        x = torch.einsum('nihwd->indhw', x)  # [3, N, D, H, W]
+        x = x.contiguous().view(3*N, -1, H, W)  # [3*N, D, H, W]
+        x = self.deconv(x)  # [3*N, D', H', W']
+        x = x.view(3, N, *x.shape[-3:])  # [3, N, D', H', W']
+        x = torch.einsum('indhw->nidhw', x)  # [N, 3, D', H', W']
+        x = x.contiguous()
+        return x