NIRVANALAN committed on
Commit
11e6f7b
1 Parent(s): 02553f6
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. LICENSE +10 -0
  3. README.md +387 -10
  4. __pycache__/gradio_app.cpython-310.pyc +0 -0
  5. assets/ffhq_eval_pose.pt +3 -0
  6. assets/i23d/single-img-cond/birthday-cake.gif +3 -0
  7. assets/i23d/single-img-cond/chest.gif +3 -0
  8. assets/i23d/single-img-cond/flower.gif +3 -0
  9. assets/i23d/single-img-cond/genshin-house.gif +3 -0
  10. assets/i23d/single-img-cond/robot.gif +3 -0
  11. assets/i23d_examples/for_demo_inference/fox-input.png +0 -0
  12. assets/i23d_examples/for_demo_inference/genshin_building-input.png +0 -0
  13. assets/i23d_examples/for_demo_inference/sword-input.png +0 -0
  14. assets/i23d_examples/for_demo_inference/teasure_chest-input.png +0 -0
  15. assets/i23d_examples/instant_mesh_samples/bulldog-input.png +0 -0
  16. assets/i23d_examples/instant_mesh_samples/cake-input.png +0 -0
  17. assets/i23d_examples/instant_mesh_samples/cute_tiger-input.png +0 -0
  18. assets/i23d_examples/instant_mesh_samples/extinguisher-input.png +0 -0
  19. assets/i23d_examples/instant_mesh_samples/genshin_teapot-input.png +0 -0
  20. assets/i23d_examples/instant_mesh_samples/house2-input.png +0 -0
  21. assets/i23d_examples/instant_mesh_samples/mushroom_teapot-input.png +0 -0
  22. assets/i23d_examples/instant_mesh_samples/plant-input.png +0 -0
  23. assets/i23d_examples/instant_mesh_samples/robot-input.png +0 -0
  24. assets/i23d_examples/instant_mesh_samples/sorting_board-input.png +0 -0
  25. assets/input_cameras.pt +3 -0
  26. assets/objv_eval_pose.pt +3 -0
  27. assets/render_cameras.pt +3 -0
  28. assets/shapenet_eval_pose.pt +3 -0
  29. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000.json +47 -0
  30. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000.png +0 -0
  31. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_albedo.png +0 -0
  32. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_hdr.exr +0 -0
  33. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_mr.png +0 -0
  34. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_nd.exr +0 -0
  35. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_ng.exr +0 -0
  36. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001.json +47 -0
  37. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001.png +0 -0
  38. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_albedo.png +0 -0
  39. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_hdr.exr +0 -0
  40. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_mr.png +0 -0
  41. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_nd.exr +0 -0
  42. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_ng.exr +0 -0
  43. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002.json +47 -0
  44. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002.png +0 -0
  45. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_albedo.png +0 -0
  46. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_hdr.exr +0 -0
  47. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_mr.png +0 -0
  48. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_nd.exr +0 -0
  49. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_ng.exr +0 -0
  50. assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00003/00003.json +47 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ply filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,10 @@
+ S-Lab License 1.0
+
+ Copyright 2023 S-Lab
+
+ Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ 4. In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work.
README.md CHANGED
@@ -1,12 +1,389 @@
  ---
- title: LN3Diff I23D ECCV24
- emoji: 👀
- colorFrom: blue
- colorTo: red
- sdk: gradio
- sdk_version: 4.41.0
- app_file: app.py
- pinned: false
- ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+
+ <h1>
+ LN3Diff: Scalable Latent Neural Fields Diffusion for Speedy 3D Generation (ECCV 2024)
+ </h1>
+
+ <div>
+ <a href='https://github.com/NIRVANALAN' target='_blank'>Yushi Lan</a><sup>1</sup>&emsp;
+ <a href='https://hongfz16.github.io' target='_blank'>Fangzhou Hong</a><sup>1</sup>&emsp;
+ <a href='https://williamyang1991.github.io/' target='_blank'>Shuai Yang</a><sup>2</sup>&emsp;
+ <a href='https://shangchenzhou.com/' target='_blank'>Shangchen Zhou</a><sup>1</sup>&emsp;
+ <a href='https://sg.linkedin.com/in/xuyi-meng-673779208' target='_blank'>Xuyi Meng</a><sup>1</sup>&emsp;
+ <br>
+ <a href='https://daibo.info/' target='_blank'>Bo Dai</a><sup>3</sup>
+ <a href='https://xingangpan.github.io/' target='_blank'>Xingang Pan</a><sup>1</sup>
+ <a href='https://www.mmlab-ntu.com/person/ccloy/' target='_blank'>Chen Change Loy</a><sup>1</sup>&emsp;
+ </div>
+ <div>
+ S-Lab, Nanyang Technological University<sup>1</sup>;
+ <br>
+ Wangxuan Institute of Computer Technology, Peking University<sup>2</sup>;
+ <br>
+ Shanghai Artificial Intelligence Laboratory<sup>3</sup>
+ </div>
+
+ <div>
+ <a href="https://hits.seeyoufarm.com"><img src="https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FNIRVANALAN%2FLN3Diff&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false"/></a>
+ </div>
+ <br>
+ <strong>
+ LN3Diff (Latent Neural Fields 3D Diffusion) is a generic, feedforward 3D LDM framework that creates high-quality 3D object meshes from text within SECONDS.
+ </strong>
+
+ <table>
+ <tr></tr>
+ <tr>
+ <td>
+ <img src="assets/t23d/dit-l2/the-eiffel-tower.gif">
+ </td>
+ <td>
+ <img src="assets/t23d/dit-l2/stone-waterfall-with-wooden-shed.gif">
+ </td>
+ <td>
+ <img src="assets/t23d/dit-l2/a-plate-of-sushi.gif">
+ </td>
+ <td>
+ <img src="assets/t23d/dit-l2/wooden-chest-with-golden-trim.gif">
+ </td>
+ <td>
+ <img src="assets/t23d/dit-l2/a-blue-plastic-chair.gif">
+ </td>
+ </tr>
+
+ <tr>
+ <td align='center' width='20%'>The Eiffel Tower.</td>
+ <td align='center' width='20%'>A stone waterfall with wooden shed.</td>
+ <td align='center' width='20%'>A plate of sushi.</td>
+ <td align='center' width='20%'>A wooden chest with golden trim.</td>
+ <td align='center' width='20%'>A blue plastic chair.</td>
+ </tr>
+ <tr></tr>
+ </table>
+
+ For more visual results, please check out our <a href="https://nirvanalan.github.io/projects/ln3diff/" target="_blank">project page</a> :page_with_curl:
+
+ <strike>
+ Codes coming soon :facepunch:
+ </strike>
+
+ This repository contains the official implementation of LN3Diff:
+ Scalable Latent Neural Fields Diffusion for Speedy 3D Generation
+
+ </div>
+
  ---

+ <h4 align="center">
+ <a href="https://nirvanalan.github.io/projects/ln3diff/" target='_blank'>[Project Page]</a>
+
+ <a href="https://arxiv.org/pdf/2403.12019.pdf" target='_blank'>[arXiv]</a>
+ </h4>
+
+
+ ## :mega: Updates
+
+ [08/2024] We have released the new 3D VAE trained on the full G-Objaverse set, together with the corresponding DiT-based T23D and I23D models trained with flow matching. Please check the samples below.
+
+ [06/2024] LN3Diff was accepted to ECCV 2024 :partying_face:!
+
+ [04/2024] Inference and training code for Objaverse, ShapeNet and FFHQ is released, including pre-trained models and the training dataset.
+
+ [03/2024] Initial code release.
+
+
+ ## :dromedary_camel: TODO
+
+ - [ ] Add Gradio demo.
+ - [x] Release the new I23D flow-matching-based DiT model trained on 180K G-Objaverse instances (Aug 2024).
+ - [x] Release the new T23D DDPM-based DiT model trained on 180K G-Objaverse instances (Aug 2024).
+ - [x] Release the new 3D VAE trained on 180K G-Objaverse instances (July 2024).
+ - [x] Release the DiT-based, flow-matching-based 3D generation framework (July 2024).
+ - [ ] Polish the dataset preparation and training docs.
+ - [ ] Add metrics evaluation scripts and samples.
+ - [ ] Lint the code.
+ - [x] Release the inference and training code (Apr 2024).
+ - [x] Release the pre-trained checkpoints for ShapeNet and FFHQ (Apr 2024).
+ - [x] Release the pre-trained checkpoints of the T23D Objaverse model trained on a 30K+ instance dataset (Apr 2024).
+ - [x] Release the stage-1 Objaverse VAE trained on an 80K+ instance dataset (Apr 2024).
+
+
+ ## :handshake: Citation
+ If you find our work useful for your research, please consider citing the paper:
+ ```
+ @inproceedings{lan2024ln3diff,
+     title={LN3Diff: Scalable Latent Neural Fields Diffusion for Speedy 3D Generation},
+     author={Yushi Lan and Fangzhou Hong and Shuai Yang and Shangchen Zhou and Xuyi Meng and Bo Dai and Xingang Pan and Chen Change Loy},
+     year={2024},
+     booktitle={ECCV},
+ }
+ ```
+
+ ## :desktop_computer: Requirements
+
+ NVIDIA GPUs are required for this project.
+ We conduct all training on NVIDIA V100-32GiB (ShapeNet, FFHQ) and NVIDIA A100-80GiB (G-Objaverse) GPUs.
+ We have tested the inference code on NVIDIA V100.
+ We recommend using Anaconda to manage the Python environment.
+
+ The environment can be created via ```conda env create -f environment_ln3diff.yml``` and activated via ```conda activate ln3diff```.
+ If you want to reuse your own PyTorch environment, install the following packages in your environment:
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ ## :running_woman: Inference
+
+ ### Download Models
+
+ The pretrained stage-1 VAE and stage-2 LDM can be downloaded via [OneDrive](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/EdOR7CbhyndFryaDnlexFqwBIr9XzFgdKXoLOOIagt7Ggw?e=ZzULnq).
+
+ Put the downloaded checkpoints under the ```checkpoints``` folder for inference. The checkpoints directory layout should be
+
+ checkpoints
+ ├── objaverse
+ │   ├── model_rec1890000.pt                               # DiT/L-based 3D VAE
+ │   └── objaverse-dit
+ │       ├── t23d/model_joint_denoise_rec_model3820000.pt  # T23D diffusion model
+ │       └── i23d/model_joint_denoise_rec_model2990000.pt  # I23D diffusion model
+ ├── shapenet
+ │   ├── car
+ │   │   └── model_joint_denoise_rec_model1580000.pt
+ │   ├── chair
+ │   │   └── model_joint_denoise_rec_model2030000.pt
+ │   └── plane
+ │       └── model_joint_denoise_rec_model770000.pt
+ ├── ffhq
+ │   └── objaverse-vae/model_joint_denoise_rec_model1580000.pt
+ └── ...
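+
+ As a minimal sketch (the archive name below is hypothetical; use whatever file the OneDrive link actually provides), the checkpoints can be unpacked in place as follows:
+
+ ```bash
+ # hypothetical archive name -- adjust to the file downloaded from OneDrive
+ mkdir -p checkpoints
+ unzip ln3diff_checkpoints.zip -d checkpoints
+ # the inference scripts then expect paths such as checkpoints/objaverse/model_rec1890000.pt
+ ls checkpoints/objaverse
+ ```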
208
+
209
+
210
+
211
+ ### Inference Commands
212
+
213
+ <strong>Note that to extract the mesh, 24GiB VRAM is required.</strong>
214
+
215
+ ## (New) Inference: (Single) Image-to-3D
216
+
217
+ We train a single-image-conditioned DiT-L/2 on the extracted VAE latents using [flow-matching](https://github.com/willisma/SiT]) framework, for more controllable 3D generation. To inference the results, please run
218
+
219
+ ```bash
220
+ bash shell_scripts/final_release/inference/sample_obajverse_i23d_dit.sh
221
+ ```
222
+
223
+ Which reconstructs the 3D assets given input images from ```assets/i23d_examples/for_demo_inference```. The input images are borrowed from [InstantMesh](https://github.com/TencentARC/InstantMesh). The model outputs are shown below (input in the next row.):
224
+
225
+ <table>
226
+ <tr></tr>
227
+ <tr>
228
+ <td>
229
+ <img src="assets/i23d/single-img-cond/genshin-house.gif">
230
+ </td>
231
+ <td>
232
+ <img src="assets/i23d/single-img-cond/chest.gif">
233
+ </td>
234
+ <td>
235
+ <img src="assets/i23d/single-img-cond/flower.gif">
236
+ </td>
237
+ <td>
238
+ <img src="assets/i23d/single-img-cond/robot.gif">
239
+ </td>
240
+ <td>
241
+ <img src="assets/i23d/single-img-cond/birthday-cake.gif">
242
+ </td>
243
+ <!-- <td>
244
+ <img src="assets/i23d/single-img-cond/birthday-cake.gif">
245
+ </td> -->
246
+ </tr>
247
+
248
+ To run 3D reconstruction with your own data, just change the ```$eval_path``` in the above bash file. E.g., change it to ```eval_path=./assets/i23d_examples/instant_mesh_samples``` will do 3D reconstruction on more real images from InstantMesh.
249
+ Also, tuning the cfg through ```$unconditional_guidance_scale``` will balance the generation fidelity and diversity.
250
+
251
+
252
+ <tr>
253
+ <td align='center' width='20%'>
254
+ <img src="assets/i23d_examples/for_demo_inference/genshin_building-input.png">
255
+ </td>
256
+ <td align='center' width='20%'>
257
+ <img src="assets/i23d_examples/for_demo_inference/teasure_chest-input.png">
258
+ </td>
259
+ <td align='center' width='20%'>
260
+ <img src="assets/i23d_examples/instant_mesh_samples/plant-input.png">
261
+ </td>
262
+ <td align='center' width='20%'>
263
+ <img src="assets/i23d_examples/instant_mesh_samples/robot-input.png">
264
+ </td>
265
+ <td align='center' width='20%'>
266
+ <img src="assets/i23d_examples/instant_mesh_samples/cake-input.png">
267
+ </td>
268
+ <!-- <td align='center' width='20%'>
269
+ <img src="assets/i23d_examples/for_demo_inference/sword-input.png">
270
+ </td> -->
271
+ </tr>
272
+ <tr></tr>
273
+ </table>
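+
+ A minimal sketch of the two variables mentioned above (assuming they are defined near the top of the released shell script; the values here are only examples):
+
+ ```bash
+ # inside shell_scripts/final_release/inference/sample_obajverse_i23d_dit.sh (a sketch, not a verbatim copy)
+ eval_path=./assets/i23d_examples/instant_mesh_samples   # folder of single-view input images
+ unconditional_guidance_scale=5.0                        # higher favors fidelity to the input, lower favors diversity
+ ```
+
+ After editing the script, re-run ```bash shell_scripts/final_release/inference/sample_obajverse_i23d_dit.sh```.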
+
+ We have uploaded the inference results on some common I23D images (from InstantMesh) to [OneDrive](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/EeMmXvoJ0khNjzWLgMBEgMMB_xb9r9ciRkDgjnQCDJqLSg?e=rk9iGs), including the condition images, the rendered images/videos and the corresponding extracted textured meshes (with 4 different seeds and cfg=5.0). Feel free to use them for comparison in your own method.
+
+ ## Inference: Text-to-3D
+
+ We train a text-conditioned 3D latent diffusion model on top of the stage-1 extracted latents.
+ In the following bash inference scripts, set ```--save_img True``` to extract a textured mesh from the generated tri-plane. To change the text prompt, set the ```prompt``` variable. For unconditional sampling, set the CFG guidance ```unconditional_guidance_scale=0```. Feel free to tune the CFG guidance scale to trade off diversity and fidelity.
+
+ Note that the diffusion sampling batch size is set to ```4```, which costs around 16GiB of VRAM. Mesh extraction for a single instance costs 24GiB of VRAM.
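+
+ A minimal sketch of these options (the variable names come from the text above; where exactly they live in the released scripts is an assumption here):
+
+ ```bash
+ # sketch of the sampling options named above, not a verbatim copy of the released script
+ prompt="A wooden chest with golden trim"   # text condition
+ unconditional_guidance_scale=4.0           # CFG scale; 0 gives unconditional samples
+ # pass --save_img True to the sampling script to also extract the textured mesh (~24GiB VRAM)
+ ```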
+
+ ### text-to-3D on Objaverse
+
+ ```bash
+ bash shell_scripts/final_release/inference/sample_obajverse_t23d_dit.sh
+ ```
+ which should reproduce the results shown in Fig. 5 of our paper, using the same text prompts. The results may differ slightly depending on the random seed used, but the quality is the same. Some output samples are shown in the figure at the top.
+
+ Note that the text prompts are hard-coded in ```scripts/vit_triplane_diffusion_sample_objaverse.py```.
+
+ ### text-to-3D on ShapeNet
+
+ For text-to-3D on ShapeNet, run one of the following commands (which run T23D on car, chair and plane, respectively):
+ ```bash
+ bash shell_scripts/final_release/inference/sample_shapenet_car_t23d.sh
+ ```
+
+ ```bash
+ bash shell_scripts/final_release/inference/sample_shapenet_chair_t23d.sh
+ ```
+
+ ```bash
+ bash shell_scripts/final_release/inference/sample_shapenet_plane_t23d.sh
+ ```
+
+ The output samples for the FID and COV/MMD calculation are uploaded [here](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/Euc7VaM3SH9EmaJuwC0dG9cBWgyLQY6gsiogGMO4NB-ebA?e=fGh2Rv), which should reproduce the quantitative results in Tab. 1 of the paper.
+
+
+ ### text-to-3D on FFHQ
+ For text-to-3D on FFHQ, run
+
+ ```bash
+ bash shell_scripts/final_release/inference/sample_ffhq_t23d.sh
+ ```
+
+ #### Stage-1 VAE 3D reconstruction
+
+ To run (Objaverse) stage-1 VAE 3D reconstruction and extract the VAE latents for diffusion learning, please run
+
+ ```bash
+ bash shell_scripts/final_release/inference/sample_obajverse.sh
+ ```
+
+ which should give the following result:
+
+ ![Stage-1 VAE reconstruction](assets/stage1_vae_reconstruction/reconstruction_result/stage1-vae-reconstruction.gif)
+
+ The marching-cubes extracted mesh can be visualized with Blender/MeshLab:
+
+ <img title="Mesh Visualization" alt="Mesh Visualization" src="./assets/stage1_vae_reconstruction/reconstruction_result/mesh-visualization.png">
+
+ The above VAE inputs and reconstruction outputs can be found in the [assets/stage1_vae_reconstruction](./assets/stage1_vae_reconstruction) folder.
+
+ **!! We have uploaded the pre-extracted VAE latents [here](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/Ef_7iMZRQT5Bl5YI0hHabQ0B_Y8INDDuaq78gOJaQSPiqg?e=Ef3rXK), which contain the corresponding VAE latents (with shape 32x32x12) of 176K G-buffer Objaverse objects. Feel free to use them in your own task.**
+
+ For more G-buffer Objaverse examples, download the [demo data](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/EoyzVJbMyBhLoKFJbbsq6bYBi1paLwQxIDjTkO1KjI4b1g?e=sJc3rQ).
+
+
+ ## :running_woman: Training
+
+ ### For training stage-1 VAE
+
+ For Objaverse, we use the renderings provided by [G-buffer Objaverse](https://aigc3d.github.io/gobjaverse/).
+ We process the data into multi-view chunks for faster loading, and the pre-processed data (176K instances) can be downloaded [here](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/EtOUTVZNtGxCg3aJnCDkqZcBmH_-OxCGLQBDwqq9ny5Ing?e=BUHiZi). Note that you need 450 GiB of storage to download the dataset.
+
+ <!-- A demo subset for stage-1 VAE reconstruction can be downloaded from [here](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/Eb6LX2x-EgJLpiHbhRxsN9ABnEaSyjG-tsVBcUr_dQ5dnQ?e=JXWQo1). Note that for Objaverse training, we pre-process the raw data into [wds-dataset](https://github.com/webdataset/webdataset) shards for fast and flexible loading. The sample shard data can be found in [here](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/ErtZQgnEH5ZItDqdUaiVbJgBe4nhZveJemQRqDW6Xwp7Zg?e=Zqt6Ss). -->
+
+ For ShapeNet, we render our own data with foreground masks for training, which can be downloaded from [here](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/EijBXIC_bUNOo0L3wnJKRqoBCqVnhhT_BReYRc1tc_0lrA?e=VQwWOZ). For training, we convert the raw data to LMDB for faster data loading. The pre-processed LMDB file can be downloaded from [here](https://entuedu-my.sharepoint.com/:f:/g/personal/yushi001_e_ntu_edu_sg/Ev7L8Als8K9JtLtj1G23Cc0BTNDbhCQPadxNLLVS7mV2FQ?e=C5woyE).
+
+ For FFHQ, we use the pre-processed dataset from [EG3D](https://github.com/NVlabs/eg3d) and compress it into LMDB, which can also be found in the OneDrive link above.
+
+ ### For training stage-2 LDM
+
+ #### Pre-extracted latents
+
+ We have uploaded the pre-extracted VAE latents [here](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/Ef_7iMZRQT5Bl5YI0hHabQ0B_Y8INDDuaq78gOJaQSPiqg?e=Ef3rXK), which contain the corresponding VAE latents (with shape 32x32x3x4) of 176K G-buffer Objaverse objects. Feel free to use them for LDM training.
+
+ #### text-to-3D
+ The Cap3D captions can be downloaded from [here](https://entuedu-my.sharepoint.com/:u:/g/personal/yushi001_e_ntu_edu_sg/EdzVtlT_eUpItE73osqE1UEBSNmC2wfQ0YimmMcLcRhpqw?e=wMHtvx).
+ Please put them under ```./datasets/text_captions_cap3d.json```.
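+
+ For example (a sketch, assuming the file was downloaded to the current working directory):
+
+ ```bash
+ mkdir -p ./datasets
+ mv text_captions_cap3d.json ./datasets/text_captions_cap3d.json
+ ```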
+
+ #### image-to-3D
+ We directly use the G-Objaverse rendered images for training, so you may need to download their data for these experiments.
+
+
+ ### Training Commands
+
+ Coming soon.
+
+ ## More discussion of the proposed method
+ Compared to existing 3D generation frameworks such as SDS-based ([DreamFusion](https://dreamfusion3d.github.io/)), multi-view generation-based ([MVDream](https://arxiv.org/abs/2308.16512), [Zero123++](https://github.com/SUDO-AI-3D/zero123plus), [Instant3D](https://instant-3d.github.io/)) and feedforward 3D reconstruction-based ([LRM](https://yiconghong.me/LRM/), [InstantMesh](https://github.com/TencentARC/InstantMesh), [LGM](https://github.com/3DTopia/LGM)) methods, LN3Diff is a native 3D diffusion framework.
+ Like 2D/video AIGC pipelines, LN3Diff first trains a 3D VAE and then conducts LDM training (text/image conditioned) on the learned latent space. Some related methods from industry ([Shap-E](https://github.com/openai/shap-e), [CLAY](https://github.com/CLAY-3D/OpenCLAY), [Meta 3D Gen](https://arxiv.org/abs/2303.05371)) follow the same paradigm.
+ Though the performance of native 3D LDMs is currently inferior overall to that of reconstruction-based methods, we believe the proposed method has great potential, scales better with more data and compute, and may yield better 3D editing performance thanks to its compatibility with diffusion models.
+
+ ## :newspaper_roll: License
+
+ Distributed under the NTU S-Lab License. See `LICENSE` for more information.
+
+
+ ## Contact
+
+ If you have any questions, please feel free to contact us via `lanyushi15@gmail.com` or GitHub issues.
__pycache__/gradio_app.cpython-310.pyc ADDED
Binary file (8.33 kB).
 
assets/ffhq_eval_pose.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d73c5a2925b957a64d4ac3a00dcb88cfab00bb476ab671b71f9cfcaf19cc977
+ size 4352
assets/i23d/single-img-cond/birthday-cake.gif ADDED
Git LFS Details
  • SHA256: 139e28118a6f43bb0f1b6d098a2fffab13624cd59263b72aef808dc70a136ac1
  • Pointer size: 131 Bytes
  • Size of remote file: 567 kB
assets/i23d/single-img-cond/chest.gif ADDED
Git LFS Details
  • SHA256: e42f5131707b35048257ba4d02acbccfab4e2ac2d1d587491051e78021ae0694
  • Pointer size: 131 Bytes
  • Size of remote file: 591 kB
assets/i23d/single-img-cond/flower.gif ADDED
Git LFS Details
  • SHA256: c24c54cddcedee3291ec53b09ed9e79823c2e4b3561d000681b13671e726e8ea
  • Pointer size: 131 Bytes
  • Size of remote file: 435 kB
assets/i23d/single-img-cond/genshin-house.gif ADDED
Git LFS Details
  • SHA256: b16b6147d040acddcb6ecf5eea42144660108f219dd2ab75dffc296aab5338f7
  • Pointer size: 131 Bytes
  • Size of remote file: 605 kB
assets/i23d/single-img-cond/robot.gif ADDED
Git LFS Details
  • SHA256: bbca8524ab5f96cd6c53416c1a9e49f92d79407109a9c193fc8faecd79094d14
  • Pointer size: 131 Bytes
  • Size of remote file: 501 kB
assets/i23d_examples/for_demo_inference/fox-input.png ADDED
assets/i23d_examples/for_demo_inference/genshin_building-input.png ADDED
assets/i23d_examples/for_demo_inference/sword-input.png ADDED
assets/i23d_examples/for_demo_inference/teasure_chest-input.png ADDED
assets/i23d_examples/instant_mesh_samples/bulldog-input.png ADDED
assets/i23d_examples/instant_mesh_samples/cake-input.png ADDED
assets/i23d_examples/instant_mesh_samples/cute_tiger-input.png ADDED
assets/i23d_examples/instant_mesh_samples/extinguisher-input.png ADDED
assets/i23d_examples/instant_mesh_samples/genshin_teapot-input.png ADDED
assets/i23d_examples/instant_mesh_samples/house2-input.png ADDED
assets/i23d_examples/instant_mesh_samples/mushroom_teapot-input.png ADDED
assets/i23d_examples/instant_mesh_samples/plant-input.png ADDED
assets/i23d_examples/instant_mesh_samples/robot-input.png ADDED
assets/i23d_examples/instant_mesh_samples/sorting_board-input.png ADDED
assets/input_cameras.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ce0c9413275642ff034bb7b93bd4dd1afff084ef2cbfdb4dcaa4f87fda68191
+ size 1594
assets/objv_eval_pose.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09b6e0eb22da1ca5d1e4a4fd1bfe5b08d65d1b1926621aa22601b67f20904f9a
+ size 4721
assets/render_cameras.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c118a971e74bd10f18821c1254e3e44a3adb55f752b36ebbca3e726df91a4e91
+ size 13183
assets/shapenet_eval_pose.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f53380bc2a91ed9beddc06dce68ec0e7c9a0be62770124b717b2ce086cbba8a6
+ size 10764
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "max_depth": 5.0,
+   "bbox": [
+     [
+       -0.330194056,
+       -0.449999958,
+       -0.263895959
+     ],
+     [
+       0.330194056,
+       0.450000018,
+       0.263895959
+     ]
+   ],
+   "origin": [
+     1.64323258,
+     0.0,
+     0.315478027
+   ],
+   "x_fov": 0.691150367,
+   "y_fov": 0.691150367,
+   "x": [
+     1.2196297E-07,
+     1.00000012,
+     0.0
+   ],
+   "y": [
+     0.188542932,
+     0.0,
+     -0.982064962
+   ],
+   "z": [
+     -0.9820651,
+     1.2196297E-07,
+     -0.188542932
+   ],
+   "scale": [
+     0.0023696092,
+     0.0023696092,
+     0.0023696092
+   ],
+   "offset": [
+     0.0,
+     -0.4037283,
+     -0.06950388
+   ]
+ }
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_albedo.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_hdr.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_mr.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_nd.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00000/00000_ng.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "max_depth": 5.0,
+   "bbox": [
+     [
+       -0.330194056,
+       -0.449999958,
+       -0.263895959
+     ],
+     [
+       0.330194056,
+       0.450000018,
+       0.263895959
+     ]
+   ],
+   "origin": [
+     1.58724082,
+     0.425299883,
+     0.315478027
+   ],
+   "x_fov": 0.691150367,
+   "y_fov": 0.691150367,
+   "x": [
+     -0.258818865,
+     0.9659259,
+     2.14746585E-08
+   ],
+   "y": [
+     0.18211852,
+     0.0487984978,
+     -0.982064962
+   ],
+   "z": [
+     -0.948601961,
+     -0.2541769,
+     -0.188542962
+   ],
+   "scale": [
+     0.0023696092,
+     0.0023696092,
+     0.0023696092
+   ],
+   "offset": [
+     0.0,
+     -0.4037283,
+     -0.06950388
+   ]
+ }
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_albedo.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_hdr.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_mr.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_nd.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00001/00001_ng.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "max_depth": 5.0,
+   "bbox": [
+     [
+       -0.330194056,
+       -0.449999958,
+       -0.263895959
+     ],
+     [
+       0.330194056,
+       0.450000018,
+       0.263895959
+     ]
+   ],
+   "origin": [
+     1.42308116,
+     0.8216163,
+     0.315478027
+   ],
+   "x_fov": 0.691150367,
+   "y_fov": 0.691150367,
+   "x": [
+     -0.50000006,
+     0.8660254,
+     -6.586047E-09
+   ],
+   "y": [
+     0.163282961,
+     0.0942714661,
+     -0.982064962
+   ],
+   "z": [
+     -0.8504932,
+     -0.4910325,
+     -0.188542932
+   ],
+   "scale": [
+     0.0023696092,
+     0.0023696092,
+     0.0023696092
+   ],
+   "offset": [
+     0.0,
+     -0.4037283,
+     -0.06950388
+   ]
+ }
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_albedo.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_hdr.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_mr.png ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_nd.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00002/00002_ng.exr ADDED
assets/stage1_vae_reconstruction/Objaverse/Animals/0/10120/campos_512_v4/00003/00003.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "max_depth": 5.0,
+   "bbox": [
+     [
+       -0.330194056,
+       -0.449999958,
+       -0.263895959
+     ],
+     [
+       0.330194056,
+       0.450000018,
+       0.263895959
+     ]
+   ],
+   "origin": [
+     1.16194093,
+     1.16194093,
+     0.315478027
+   ],
+   "x_fov": 0.691150367,
+   "y_fov": 0.691150367,
+   "x": [
+     -0.707106769,
+     0.707106769,
+     -5.59717162E-10
+   ],
+   "y": [
+     0.13332,
+     0.13332,
+     -0.982064962
+   ],
+   "z": [
+     -0.6944248,
+     -0.694424748,
+     -0.188542962
+   ],
+   "scale": [
+     0.0023696092,
+     0.0023696092,
+     0.0023696092
+   ],
+   "offset": [
+     0.0,
+     -0.4037283,
+     -0.06950388
+   ]
+ }