camenduru committed on
Commit c77cbf6 · verified · 1 Parent(s): c68420f

thanks to haoningwu ❤
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/SceneGen.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
---
pipeline_tag: image-to-3d
license: mit
language:
- en
---

# SceneGen: Single-Image 3D Scene Generation in One Feedforward Pass

This repository contains the official PyTorch implementation of SceneGen (https://arxiv.org/abs/2508.15769). Feel free to reach out for discussions!

**The inference code and pretrained models are now released!**

<div align="center">
  <img src="./assets/SceneGen.png">
</div>

## 🌟 Some Information
[Project Page](https://mengmouxu.github.io/SceneGen/) · [Paper](https://arxiv.org/abs/2508.15769/) · [Checkpoints](https://huggingface.co/haoningwu/SceneGen/)

## ⏩ News
- [2025.8] The inference code and checkpoints are released.
- [2025.8] Our pre-print paper has been released on arXiv.

## 📦 Installation & Pretrained Models

### Prerequisites
- **Hardware**: An NVIDIA GPU with at least 16 GB of memory is required. The code has been verified on NVIDIA A100 and RTX 3090 GPUs.
- **Software**:
  - The [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) is needed to compile certain submodules. The code has been tested with CUDA version 12.1.
  - Python 3.8 or higher is required.

35
+ 1. Clone the repo:
36
+ ```sh
37
+ git clone https://github.com/Mengmouxu/SceneGen.git
38
+ cd SceneGen
39
+ ```
40
+
41
+ 2. Install the dependencies:
42
+ Create a new conda environment named `scenegen` and install the dependencies:
43
+ ```sh
44
+ . ./setup.sh --new-env --basic --xformers --flash-attn --diffoctreerast --spconv --mipgaussian --kaolin --nvdiffrast --demo
45
+ ```
46
+ The detailed usage of `setup.sh` can be found by running `. ./setup.sh --help`.
47
+
### Pretrained Models
1. First, create a directory in the SceneGen folder to store the checkpoints:
   ```sh
   mkdir -p checkpoints
   ```
2. Download the pretrained **SAM2-Hiera-Large** and **VGGT-1B** models from [SAM2](https://huggingface.co/facebook/sam2-hiera-large/) and [VGGT](https://huggingface.co/facebook/VGGT-1B/), then place them in the `checkpoints` directory. (**SAM2** and its checkpoints are required for interactive generation with segmentation.)
3. Download our pretrained SceneGen model from [here](https://huggingface.co/haoningwu/SceneGen/) and place it in the `checkpoints` directory as follows:
   ```
   SceneGen/
   ├── checkpoints/
   │   ├── sam2-hiera-large
   │   ├── VGGT-1B
   │   └── scenegen
   │       ├── ckpts
   │       └── pipeline.json
   └── ...
   ```
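Before running inference, it can help to verify this layout programmatically. Below is a minimal stdlib sketch; the directory names come from the tree above, but `missing_checkpoints` is a helper introduced here for illustration, not part of the SceneGen codebase:

```python
import os

# Expected checkpoint layout, taken from the tree shown in the README.
REQUIRED = [
    "sam2-hiera-large",
    "VGGT-1B",
    os.path.join("scenegen", "ckpts"),
    os.path.join("scenegen", "pipeline.json"),
]

def missing_checkpoints(root="checkpoints"):
    """Return the required checkpoint paths that are absent under `root`."""
    return [p for p in REQUIRED if not os.path.exists(os.path.join(root, p))]

if __name__ == "__main__":
    missing = missing_checkpoints()
    print("All checkpoints in place." if not missing else f"Missing: {missing}")
```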
## 💡 Inference
We provide two scripts for inference: `inference.py` for batch processing and `interactive_demo.py` for an interactive Gradio demo.

### Interactive Demo
This script launches a Gradio web interface for interactive scene generation.
- **Features**: It uses SAM2 for interactive image segmentation, allows adjusting various generation parameters, and supports scene generation from single or multiple images.
- **Usage**:
  ```sh
  python interactive_demo.py
  ```
> ## 🚀 Quick Start Guide
>
> ### 📷 Step 1: Input & Segment
> 1. **Upload your scene image.**
> 2. **Use the mouse to draw bounding boxes** around objects.
> 3. Click **"Run Segmentation"** to segment the objects.
> > *※ For multi-image generation: keep the object annotation order consistent across all images.*
>
> ### 🗃️ Step 2: Manage Cache
> 1. Click **"Add to Cache"** when satisfied with the segmentation.
> 2. Repeat Steps 1 and 2 for additional images.
> 3. Use **"Delete Selected"** or **"Clear All"** to manage cached images.
>
> ### 🎮 Step 3: Generate Scene
> 1. Adjust generation parameters (optional).
> 2. Click **"Generate 3D Scene"**.
> 3. Download the generated GLB file when ready.
>
> **💡 Pro Tip:** Try the examples below to get started quickly!

### Pre-segmented Image Inference
This script processes a directory of pre-segmented images.
- **Input**: The input folder structure should be similar to `assets/masked_image_test`, containing segmented scene images.
- **Visualization**: For scenes with ground-truth data, you can use the `--gradio` flag to launch a Gradio interface that visualizes both the ground truth and the generated model. We provide data from the 3D-FUTURE test set as a demonstration.
- **Usage**:
  ```sh
  python inference.py --gradio
  ```

## 📚 Dataset
To be updated soon...

## 🏋️‍♂️ Training
To be updated soon...

## Evaluation
To be updated soon...

## 📜 Citation
If you use this code and data for your research or project, please cite:

    @article{meng2025scenegen,
      author  = {Meng, Yanxu and Wu, Haoning and Zhang, Ya and Xie, Weidi},
      title   = {SceneGen: Single-Image 3D Scene Generation in One Feedforward Pass},
      journal = {arXiv preprint arXiv:2508.15769},
      year    = {2025},
    }

## TODO
- [x] Release Paper
- [x] Release Checkpoints & Inference Code
- [ ] Release Training Code
- [ ] Release Evaluation Code
- [ ] Release Data Processing Code

## Acknowledgements
Many thanks to the code bases of [TRELLIS](https://github.com/microsoft/TRELLIS), [DINOv2](https://github.com/facebookresearch/dinov2), and [VGGT](https://github.com/facebookresearch/vggt).

## Contact
If you have any questions, please feel free to contact [meng-mou-xu@sjtu.edu.cn](mailto:meng-mou-xu@sjtu.edu.cn) and [haoningwu3639@gmail.com](mailto:haoningwu3639@gmail.com).
assets/SceneGen.png ADDED

Git LFS Details

  • SHA256: 2fd57a3df30a2a484dfce27ca9b65b2d4b819114b754a79486ba94026f8df1ce
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
assets/icon.png ADDED
ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json ADDED
{
    "name": "SLatGaussianDecoder",
    "args": {
        "resolution": 64,
        "model_channels": 768,
        "latent_channels": 8,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "attn_mode": "swin",
        "window_size": 8,
        "use_fp16": true,
        "representation_config": {
            "lr": {
                "_xyz": 1.0,
                "_features_dc": 1.0,
                "_opacity": 1.0,
                "_scaling": 1.0,
                "_rotation": 0.1
            },
            "perturb_offset": true,
            "voxel_size": 1.5,
            "num_gaussians": 32,
            "2d_filter_kernel_size": 0.1,
            "3d_filter_kernel_size": 9e-4,
            "scaling_bias": 4e-3,
            "opacity_bias": 0.1,
            "scaling_activation": "softplus"
        }
    }
}
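Each of these config files pairs a model `name` with its constructor `args`. A common way such files are consumed is a registry lookup that instantiates the named class. The sketch below illustrates that pattern with a stand-in class; the registry and the stub constructor are hypothetical, while the real model classes live in the SceneGen/TRELLIS codebase:

```python
import json

# Hypothetical registry; the real code maps these names to PyTorch models.
MODEL_REGISTRY = {}

def register(name):
    """Class decorator that records a constructor under a config name."""
    def decorator(cls):
        MODEL_REGISTRY[name] = cls
        return cls
    return decorator

@register("SLatGaussianDecoder")
class SLatGaussianDecoder:
    # Stand-in: stores a few args instead of building a network.
    def __init__(self, resolution, model_channels, latent_channels, **kwargs):
        self.resolution = resolution
        self.model_channels = model_channels
        self.latent_channels = latent_channels

def build_model(config_path):
    """Instantiate the model described by a {name, args} JSON config."""
    with open(config_path) as f:
        cfg = json.load(f)
    return MODEL_REGISTRY[cfg["name"]](**cfg["args"])
```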
ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:38c84bcef5ce0af1f48b1b5558dabc7575a13346043c41a7e0610f1fa619a161
size 171450952
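The weight files in this commit are stored as Git LFS pointers like the one above. After downloading, a blob can be checked against the pointer's `oid` and `size` fields; here is a stdlib-only sketch (`verify_lfs_file` is a helper written for this note, not a Git LFS tool):

```python
import hashlib
import os

def parse_lfs_pointer(path):
    """Parse a Git LFS pointer file into {'oid': hex digest, 'size': bytes}."""
    info = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key == "oid":
                info["oid"] = value.split(":", 1)[1]  # strip "sha256:" prefix
            elif key == "size":
                info["size"] = int(value)
    return info

def verify_lfs_file(pointer_path, blob_path):
    """Check a downloaded blob against its LFS pointer (size, then sha256)."""
    info = parse_lfs_pointer(pointer_path)
    if os.path.getsize(blob_path) != info["size"]:
        return False
    h = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == info["oid"]
```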
ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json ADDED
{
    "name": "SLatMeshDecoder",
    "args": {
        "resolution": 64,
        "model_channels": 768,
        "latent_channels": 8,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "attn_mode": "swin",
        "window_size": 8,
        "use_fp16": true,
        "representation_config": {
            "use_color": true
        }
    }
}
ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859
size 181903412
ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.json ADDED
{
    "name": "SLatRadianceFieldDecoder",
    "args": {
        "resolution": 64,
        "model_channels": 768,
        "latent_channels": 8,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "attn_mode": "swin",
        "window_size": 8,
        "use_fp16": true,
        "representation_config": {
            "rank": 16,
            "dim": 8
        }
    }
}
ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:737da6578d01948016b7c39786113af0d64a46f7922f6b8b5e698b84643be514
size 171450488
ckpts/slat_enc_swin8_B_64l8_fp16.json ADDED
{
    "name": "SLatEncoder",
    "args": {
        "resolution": 64,
        "in_channels": 1024,
        "model_channels": 768,
        "latent_channels": 8,
        "num_blocks": 12,
        "num_heads": 12,
        "mlp_ratio": 4,
        "attn_mode": "swin",
        "window_size": 8,
        "use_fp16": true
    }
}
ckpts/slat_enc_swin8_B_64l8_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:21dceac6bee917ab6458ff52c9757ba89a779d03031c7bd17f9e7f0103bfd436
size 173242816
ckpts/slat_flow_img_dit_L_64l8p2_fp16.json ADDED
{
    "name": "SLatFlowModel",
    "args": {
        "resolution": 64,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 1024,
        "cond_channels": 1024,
        "num_blocks": 24,
        "num_heads": 16,
        "mlp_ratio": 4,
        "patch_size": 2,
        "num_io_res_blocks": 2,
        "io_block_channels": [128],
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "use_fp16": true
    }
}
ckpts/slat_flow_img_dit_L_64l8p2_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:693fb2a58ad497bd222007301eeec49d14d60f8c12d2f2f00c221fa747b4c66c
size 1203755136
ckpts/ss_dec_conv3d_16l8_fp16.json ADDED
{
    "name": "SparseStructureDecoder",
    "args": {
        "out_channels": 1,
        "latent_channels": 8,
        "num_res_blocks": 2,
        "num_res_blocks_middle": 2,
        "channels": [512, 128, 32],
        "use_fp16": true
    }
}
ckpts/ss_dec_conv3d_16l8_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a
size 147591972
ckpts/ss_enc_conv3d_16l8_fp16.json ADDED
{
    "name": "SparseStructureEncoder",
    "args": {
        "in_channels": 1,
        "latent_channels": 8,
        "num_res_blocks": 2,
        "num_res_blocks_middle": 2,
        "channels": [32, 128, 512],
        "use_fp16": true
    }
}
ckpts/ss_enc_conv3d_16l8_fp16.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:107874eeaa0feb82f51b19db5da7db534fb7e7f19e5a122b9ff1bc2e258bfc6d
size 119068016
ckpts/ss_scenegen_flow_img_dit_L_16l8_fp16.json ADDED
{
    "name": "SparseStructureFlowModel",
    "args": {
        "resolution": 16,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 1024,
        "cond_channels": 1024,
        "num_blocks": 24,
        "num_heads": 16,
        "mlp_ratio": 4,
        "patch_size": 1,
        "pe_mode": "ape",
        "qk_rms_norm": true,
        "use_fp16": true,
        "use_global": true,
        "trunk_depth": 4,
        "num_iteration": 4,
        "use_batch_encoder": false
    }
}
ckpts/ss_scenegen_flow_img_dit_L_16l8_fp16.pt ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d0aa2be2fcf950c68da708be5b54fbe5289628e5d2aa425de41446d87c8c1936
size 2544030704
pipeline.json ADDED
{
    "name": "SceneGenImageToScenePipeline",
    "args": {
        "models": {
            "sparse_structure_decoder": "ckpts/ss_dec_conv3d_16l8_fp16",
            "sparse_structure_flow_model": "ckpts/ss_scenegen_flow_img_dit_L_16l8_fp16",
            "slat_decoder_gs": "ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16",
            "slat_decoder_rf": "ckpts/slat_dec_rf_swin8_B_64l8r16_fp16",
            "slat_decoder_mesh": "ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16",
            "slat_flow_model": "ckpts/slat_flow_img_dit_L_64l8p2_fp16"
        },
        "sparse_structure_sampler": {
            "name": "FlowEulerGuidanceIntervalSamplerVGGT",
            "args": {
                "sigma_min": 1e-5
            },
            "params": {
                "steps": 25,
                "cfg_strength": 5.0,
                "cfg_interval": [0.5, 1.0],
                "rescale_t": 3.0
            }
        },
        "slat_sampler": {
            "name": "FlowEulerGuidanceIntervalSampler",
            "args": {
                "sigma_min": 1e-5
            },
            "params": {
                "steps": 25,
                "cfg_strength": 5.0,
                "cfg_interval": [0.5, 1.0],
                "rescale_t": 3.0
            }
        },
        "slat_normalization": {
            "mean": [
                -2.1687545776367188,
                -0.004347046371549368,
                -0.13352349400520325,
                -0.08418072760105133,
                -0.5271206498146057,
                0.7238689064979553,
                -1.1414450407028198,
                1.2039363384246826
            ],
            "std": [
                2.377650737762451,
                2.386378288269043,
                2.124418020248413,
                2.1748552322387695,
                2.663944721221924,
                2.371192216873169,
                2.6217446327209473,
                2.684523105621338
            ]
        },
        "image_cond_model": "dinov2_vitl14_reg",
        "vggt_model": "checkpoints/VGGT-1B"
    }
}
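The pipeline config above ties together the model checkpoints, the two flow samplers, and the normalization statistics for the 8-channel structured latent. A stdlib-only sketch of reading it and applying the per-channel standardization; the helper names (`load_pipeline_args`, `normalize_slat`) are introduced here for illustration and are not part of the SceneGen API:

```python
import json

def load_pipeline_args(path):
    """Read a {name, args} pipeline config and return its args dict."""
    with open(path) as f:
        return json.load(f)["args"]

def normalize_slat(latents, mean, std):
    """Standardize structured latents per channel: (x - mean) / std."""
    return [[(x - m) / s for x, m, s in zip(row, mean, std)] for row in latents]
```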