heheyas committed on

Commit: cfb7702 · Parent(s): f5c8d4d

init

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +50 -0
 - configs/ae/video.yaml +35 -0
 - configs/embedder/clip_image.yaml +8 -0
 - configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml +104 -0
 - configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml +105 -0
 - configs/example_training/imagenet-f8_cond.yaml +185 -0
 - configs/example_training/toy/cifar10_cond.yaml +98 -0
 - configs/example_training/toy/mnist.yaml +79 -0
 - configs/example_training/toy/mnist_cond.yaml +98 -0
 - configs/example_training/toy/mnist_cond_discrete_eps.yaml +103 -0
 - configs/example_training/toy/mnist_cond_l1_loss.yaml +99 -0
 - configs/example_training/toy/mnist_cond_with_ema.yaml +100 -0
 - configs/example_training/txt2img-clipl-legacy-ucg-training.yaml +182 -0
 - configs/example_training/txt2img-clipl.yaml +184 -0
 - configs/inference/sd_2_1.yaml +60 -0
 - configs/inference/sd_2_1_768.yaml +60 -0
 - configs/inference/sd_xl_base.yaml +93 -0
 - configs/inference/sd_xl_refiner.yaml +86 -0
 - configs/inference/svd.yaml +131 -0
 - configs/inference/svd_image_decoder.yaml +114 -0
 - configs/inference/svd_mv.yaml +202 -0
 - mesh_recon/configs/neuralangelo-ortho-wmask.yaml +145 -0
 - mesh_recon/configs/v3d.yaml +144 -0
 - mesh_recon/configs/videonvs.yaml +144 -0
 - mesh_recon/datasets/__init__.py +17 -0
 - mesh_recon/datasets/blender.py +143 -0
 - mesh_recon/datasets/colmap.py +332 -0
 - mesh_recon/datasets/colmap_utils.py +295 -0
 - mesh_recon/datasets/dtu.py +201 -0
 - mesh_recon/datasets/fixed_poses/000_back_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_back_left_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_back_right_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_front_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_front_left_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_front_right_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_left_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_right_RT.txt +3 -0
 - mesh_recon/datasets/fixed_poses/000_top_RT.txt +3 -0
 - mesh_recon/datasets/ortho.py +287 -0
 - mesh_recon/datasets/utils.py +0 -0
 - mesh_recon/datasets/v3d.py +284 -0
 - mesh_recon/datasets/videonvs.py +256 -0
 - mesh_recon/datasets/videonvs_co3d.py +252 -0
 - mesh_recon/launch.py +144 -0
 - mesh_recon/mesh.py +845 -0
 - mesh_recon/models/__init__.py +16 -0
 - mesh_recon/models/base.py +32 -0
 - mesh_recon/models/geometry.py +238 -0
 - mesh_recon/models/nerf.py +161 -0
 - mesh_recon/models/network_utils.py +215 -0
 
    	
.gitignore
ADDED
@@ -0,0 +1,50 @@
+# extensions
+*.egg-info
+*.py[cod]
+
+# envs
+.pt13
+.pt2
+
+# directories
+/checkpoints
+/dist
+/outputs
+/build
+/src
+logs/
+ckpts/
+tmp/
+lightning_logs/
+images/
+images*/
+kb_configs/
+debug_lvis.log
+*.log
+.cache/
+redirects/
+submits/
+extern/
+assets/images
+output/
+assets/scene
+assets/GSO
+assets/SD
+spirals
+*.zip
+paper/
+spirals_co3d/
+scene_spirals/
+blenders/
+colmap_results/
+depth_spirals/
+recon/SIBR_viewers/
+recon/assets/
+mesh_recon/exp
+mesh_recon/runs
+mesh_recon/renders
+mesh_recon/refined
+*.png
+*.pdf
+*.npz
+*.npy
    	
configs/ae/video.yaml
ADDED
@@ -0,0 +1,35 @@
+target: sgm.models.autoencoder.AutoencodingEngine
+params:
+  loss_config:
+    target: torch.nn.Identity
+  regularizer_config:
+    target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+  encoder_config:
+    target: sgm.modules.diffusionmodules.model.Encoder
+    params:
+      attn_type: vanilla
+      double_z: True
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [1, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+  decoder_config:
+    target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+    params:
+      attn_type: vanilla
+      double_z: True
+      z_channels: 4
+      resolution: 256
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [1, 2, 4, 4]
+      num_res_blocks: 2
+      attn_resolutions: []
+      dropout: 0.0
+      video_kernel_size: [3, 1, 1]
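The file above is a target/params tree in the style used throughout sgm configs. As a minimal sketch of how such a file is typically consumed (assuming OmegaConf and the instantiate_from_config helper from sgm.util, which are dependencies of this kind of project rather than part of this commit):

    # Minimal sketch, not code from this commit.
    from omegaconf import OmegaConf
    from sgm.util import instantiate_from_config

    cfg = OmegaConf.load("configs/ae/video.yaml")
    # Builds the class named in `target`, passing `params` as keyword arguments;
    # nested *_config blocks are instantiated recursively by the engine itself.
    autoencoder = instantiate_from_config(cfg)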
    	
configs/embedder/clip_image.yaml
ADDED
@@ -0,0 +1,8 @@
+target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+params:
+  n_cond_frames: 1
+  n_copies: 1
+  open_clip_embedding_config:
+    target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+    params:
+      freeze: True
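freeze: True on the inner FrozenOpenCLIPImageEmbedder indicates that the CLIP image encoder is kept fixed during training. A minimal sketch of what freezing a module conventionally means in PyTorch (an assumption about the embedder's behaviour, not code from this commit):

    import torch.nn as nn

    def freeze_module(module: nn.Module) -> nn.Module:
        # Put the module in eval mode and drop it from gradient computation,
        # so the optimizer never updates its weights.
        module.eval()
        for p in module.parameters():
            p.requires_grad_(False)
        return module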
    	
configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml
ADDED
@@ -0,0 +1,104 @@
+model:
+  base_learning_rate: 4.5e-6
+  target: sgm.models.autoencoder.AutoencodingEngine
+  params:
+    input_key: jpg
+    monitor: val/rec_loss
+
+    loss_config:
+      target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
+      params:
+        perceptual_weight: 0.25
+        disc_start: 20001
+        disc_weight: 0.5
+        learn_logvar: True
+
+        regularization_weights:
+          kl_loss: 1.0
+
+    regularizer_config:
+      target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+    encoder_config:
+      target: sgm.modules.diffusionmodules.model.Encoder
+      params:
+        attn_type: none
+        double_z: True
+        z_channels: 4
+        resolution: 256
+        in_channels: 3
+        out_ch: 3
+        ch: 128
+        ch_mult: [1, 2, 4]
+        num_res_blocks: 4
+        attn_resolutions: []
+        dropout: 0.0
+
+    decoder_config:
+      target: sgm.modules.diffusionmodules.model.Decoder
+      params: ${model.params.encoder_config.params}
+
+data:
+  target: sgm.data.dataset.StableDataModuleFromConfig
+  params:
+    train:
+      datapipeline:
+        urls:
+          - DATA-PATH
+        pipeline_config:
+          shardshuffle: 10000
+          sample_shuffle: 10000
+
+        decoders:
+          - pil
+
+        postprocessors:
+          - target: sdata.mappers.TorchVisionImageTransforms
+            params:
+              key: jpg
+              transforms:
+                - target: torchvision.transforms.Resize
+                  params:
+                    size: 256
+                    interpolation: 3
+                - target: torchvision.transforms.ToTensor
+          - target: sdata.mappers.Rescaler
+          - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
+            params:
+              h_key: height
+              w_key: width
+
+      loader:
+        batch_size: 8
+        num_workers: 4
+
+
+lightning:
+  strategy:
+    target: pytorch_lightning.strategies.DDPStrategy
+    params:
+      find_unused_parameters: True
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+
+  callbacks:
+    metrics_over_trainsteps_checkpoint:
+      params:
+        every_n_train_steps: 50000
+
+    image_logger:
+      target: main.ImageLogger
+      params:
+        enable_autocast: False
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+
+  trainer:
+    devices: 0,
+    limit_val_batches: 50
+    benchmark: True
+    accumulate_grad_batches: 1
+    val_check_interval: 10000
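Note that decoder_config re-uses the encoder hyper-parameters through OmegaConf interpolation (params: ${model.params.encoder_config.params}), so the two stay in sync from a single definition. A small sketch of how that reference resolves once the file is loaded (a hypothetical check, assuming OmegaConf):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load(
        "configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml"
    )
    enc = cfg.model.params.encoder_config.params
    dec = cfg.model.params.decoder_config.params
    # The ${...} interpolation resolves on access, so encoder and decoder
    # share the same z_channels, ch_mult, etc.
    assert dec.z_channels == enc.z_channels == 4
    assert list(dec.ch_mult) == [1, 2, 4]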
    	
configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml
ADDED
@@ -0,0 +1,105 @@
+model:
+  base_learning_rate: 4.5e-6
+  target: sgm.models.autoencoder.AutoencodingEngine
+  params:
+    input_key: jpg
+    monitor: val/loss/rec
+    disc_start_iter: 0
+
+    encoder_config:
+      target: sgm.modules.diffusionmodules.model.Encoder
+      params:
+        attn_type: vanilla-xformers
+        double_z: true
+        z_channels: 8
+        resolution: 256
+        in_channels: 3
+        out_ch: 3
+        ch: 128
+        ch_mult: [1, 2, 4, 4]
+        num_res_blocks: 2
+        attn_resolutions: []
+        dropout: 0.0
+
+    decoder_config:
+      target: sgm.modules.diffusionmodules.model.Decoder
+      params: ${model.params.encoder_config.params}
+
+    regularizer_config:
+      target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+
+    loss_config:
+      target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
+      params:
+        perceptual_weight: 0.25
+        disc_start: 20001
+        disc_weight: 0.5
+        learn_logvar: True
+
+        regularization_weights:
+          kl_loss: 1.0
+
+data:
+  target: sgm.data.dataset.StableDataModuleFromConfig
+  params:
+    train:
+      datapipeline:
+        urls:
+          - DATA-PATH
+        pipeline_config:
+          shardshuffle: 10000
+          sample_shuffle: 10000
+
+        decoders:
+          - pil
+
+        postprocessors:
+          - target: sdata.mappers.TorchVisionImageTransforms
+            params:
+              key: jpg
+              transforms:
+                - target: torchvision.transforms.Resize
+                  params:
+                    size: 256
+                    interpolation: 3
+                - target: torchvision.transforms.ToTensor
+          - target: sdata.mappers.Rescaler
+          - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
+            params:
+              h_key: height
+              w_key: width
+
+      loader:
+        batch_size: 8
+        num_workers: 4
+
+
+lightning:
+  strategy:
+    target: pytorch_lightning.strategies.DDPStrategy
+    params:
+      find_unused_parameters: True
+
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+
+  callbacks:
+    metrics_over_trainsteps_checkpoint:
+      params:
+        every_n_train_steps: 50000
+
+    image_logger:
+      target: main.ImageLogger
+      params:
+        enable_autocast: False
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+
+  trainer:
+    devices: 0,
+    limit_val_batches: 50
+    benchmark: True
+    accumulate_grad_batches: 1
+    val_check_interval: 10000
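base_learning_rate here is a per-sample base rate; latent-diffusion-style trainers conventionally scale it by the effective batch size before handing it to the optimizer. A hedged arithmetic sketch under that assumption (the scaling rule itself is not part of this commit):

    # Values taken from the config above; the scaling convention is an assumption.
    base_learning_rate = 4.5e-6        # model.base_learning_rate
    batch_size = 8                     # data.params.train.loader.batch_size
    accumulate_grad_batches = 1        # lightning.trainer.accumulate_grad_batches
    num_devices = 1                    # lightning.trainer.devices: 0,

    learning_rate = base_learning_rate * batch_size * accumulate_grad_batches * num_devices
    print(f"optimizer lr = {learning_rate:.2e}")   # 3.60e-05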
    	
configs/example_training/imagenet-f8_cond.yaml
ADDED
@@ -0,0 +1,185 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+    log_keys:
+      - cls
+
+    scheduler_config:
+      target: sgm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [10000]
+        cycle_lengths: [10000000000000]
+        f_start: [1.e-6]
+        f_max: [1.]
+        f_min: [1.]
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 256
+        attention_resolutions: [1, 2, 4]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4]
+        num_head_channels: 64
+        num_classes: sequential
+        adm_in_channels: 1024
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - is_trainable: True
+            input_key: cls
+            ucg_rate: 0.2
+            target: sgm.modules.encoders.modules.ClassEmbedder
+            params:
+              add_sequence_dim: True
+              embed_dim: 1024
+              n_classes: 1000
+
+          - is_trainable: False
+            ucg_rate: 0.2
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            ucg_rate: 0.2
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKL
+      params:
+        ckpt_path: CKPT_PATH
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    loss_fn_config:
+      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
+      params:
+        loss_weighting_config:
+          target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
+        sigma_sampler_config:
+          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
+          params:
+            num_idx: 1000
+
+            discretization_config:
+              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 50
+
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
+          params:
+            scale: 5.0
+
+data:
+  target: sgm.data.dataset.StableDataModuleFromConfig
+  params:
+    train:
+      datapipeline:
+        urls:
+          # USER: adapt this path the root of your custom dataset
+          - DATA_PATH
+        pipeline_config:
+          shardshuffle: 10000
+          sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM
+
+        decoders:
+          - pil
+
+        postprocessors:
+          - target: sdata.mappers.TorchVisionImageTransforms
+            params:
+              key: jpg # USER: you might wanna adapt this for your custom dataset
+              transforms:
+                - target: torchvision.transforms.Resize
+                  params:
+                    size: 256
+                    interpolation: 3
+                - target: torchvision.transforms.ToTensor
+          - target: sdata.mappers.Rescaler
+
+          - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
+            params:
+              h_key: height # USER: you might wanna adapt this for your custom dataset
+              w_key: width # USER: you might wanna adapt this for your custom dataset
+
+      loader:
+        batch_size: 64
+        num_workers: 6
+
+lightning:
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+
+  callbacks:
+    metrics_over_trainsteps_checkpoint:
+      params:
+        every_n_train_steps: 25000
+
+    image_logger:
+      target: main.ImageLogger
+      params:
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          N: 8
+          n_rows: 2
+
+  trainer:
+    devices: 0,
+    benchmark: True
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
+    max_epochs: 1000
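In this config the sampler's guider_config is VanillaCFG with scale: 5.0, i.e. standard classifier-free guidance, and the ucg_rate: 0.2 entries in the conditioner are what make an unconditional branch available by dropping the conditioning for roughly 20% of training samples. A minimal sketch of the guidance combination (the standard CFG formula, assumed rather than taken from this commit):

    import torch

    def cfg_combine(cond: torch.Tensor, uncond: torch.Tensor, scale: float = 5.0) -> torch.Tensor:
        # Classifier-free guidance: push the conditional prediction away from the
        # unconditional one by `scale`; scale = 1.0 recovers the conditional output.
        return uncond + scale * (cond - uncond)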
    	
        configs/example_training/toy/cifar10_cond.yaml
    ADDED
    
@@ -0,0 +1,98 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
          params:
            sigma_data: 1.0

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        in_channels: 3
        out_channels: 3
        model_channels: 32
        attention_resolutions: []
        num_res_blocks: 4
        channel_mult: [1, 2, 2]
        num_head_channels: 32
        num_classes: sequential
        adm_in_channels: 128

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: cls
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.ClassEmbedder
            params:
              embed_dim: 128
              n_classes: 10

    first_stage_config:
      target: sgm.models.autoencoder.IdentityFirstStage

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 3.0

data:
  target: sgm.data.cifar10.CIFAR10Loader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 64
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 64
          n_rows: 8

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 20
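Every target:/params: pair in these configs names a Python class by dotted import path plus its constructor kwargs. The snippet below is a minimal, illustrative version of that convention, assuming OmegaConf is available; the repository ships its own helper for this (commonly called instantiate_from_config), so this is a sketch of the idea, not the repo's exact implementation.

import importlib

from omegaconf import OmegaConf


def instantiate_from_config(config):
    # "target" is a dotted import path; "params" are constructor kwargs.
    module_path, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**config.get("params", dict()))


if __name__ == "__main__":
    cfg = OmegaConf.load("configs/example_training/toy/cifar10_cond.yaml")
    # Build just the sampler described under model.params.sampler_config.
    sampler = instantiate_from_config(cfg.model.params.sampler_config)

The same resolution step applies recursively: nested *_config blocks (denoiser, conditioner, sampler, guider, ...) are each instantiated from their own target/params pair.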
    	
        configs/example_training/toy/mnist.yaml
    ADDED
    
@@ -0,0 +1,79 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
          params:
            sigma_data: 1.0

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        in_channels: 1
        out_channels: 1
        model_channels: 32
        attention_resolutions: []
        num_res_blocks: 4
        channel_mult: [1, 2, 2]
        num_head_channels: 32

    first_stage_config:
      target: sgm.models.autoencoder.IdentityFirstStage

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization

data:
  target: sgm.data.mnist.MNISTLoader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 64
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 64
          n_rows: 8

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 10
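The sigma_sampler_config above (EDMSampling) draws the per-example training noise level from a log-normal distribution. A hedged sketch of that draw, assuming the common EDM defaults P_mean = -1.2 and P_std = 1.2, which this config does not override:

import torch


def sample_sigmas(n: int, p_mean: float = -1.2, p_std: float = 1.2) -> torch.Tensor:
    # sigma = exp(p_mean + p_std * z), z ~ N(0, 1)
    return torch.exp(p_mean + p_std * torch.randn(n))


sigmas = sample_sigmas(4)  # four positive noise levels, log-normally distributed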
    	
        configs/example_training/toy/mnist_cond.yaml
    ADDED
    
@@ -0,0 +1,98 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
          params:
            sigma_data: 1.0

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        in_channels: 1
        out_channels: 1
        model_channels: 32
        attention_resolutions: []
        num_res_blocks: 4
        channel_mult: [1, 2, 2]
        num_head_channels: 32
        num_classes: sequential
        adm_in_channels: 128

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: cls
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.ClassEmbedder
            params:
              embed_dim: 128
              n_classes: 10

    first_stage_config:
      target: sgm.models.autoencoder.IdentityFirstStage

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 3.0

data:
  target: sgm.data.mnist.MNISTLoader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 16
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 16
          n_rows: 4

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 20
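In this conditional variant, ucg_rate: 0.2 drops the class conditioning during training, and VanillaCFG with scale: 3.0 recombines conditional and unconditional predictions at sampling time. The sketch below shows the standard classifier-free-guidance arithmetic as an illustration; the repo's own guider may differ in detail.

import torch


def cfg_combine(pred_uncond: torch.Tensor, pred_cond: torch.Tensor, scale: float = 3.0) -> torch.Tensor:
    # scale corresponds to guider_config.params.scale in the config above.
    return pred_uncond + scale * (pred_cond - pred_uncond)


def maybe_drop_label(label_emb: torch.Tensor, ucg_rate: float = 0.2) -> torch.Tensor:
    # Training-time conditioning dropout: zero the class embedding with
    # probability ucg_rate, mirroring the ucg_rate set on the ClassEmbedder entry.
    keep = (torch.rand(label_emb.shape[0], 1) > ucg_rate).float()
    return label_emb * keep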
    	
        configs/example_training/toy/mnist_cond_discrete_eps.yaml
    ADDED
    
@@ -0,0 +1,103 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        in_channels: 1
        out_channels: 1
        model_channels: 32
        attention_resolutions: []
        num_res_blocks: 4
        channel_mult: [1, 2, 2]
        num_head_channels: 32
        num_classes: sequential
        adm_in_channels: 128

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: cls
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.ClassEmbedder
            params:
              embed_dim: 128
              n_classes: 10

    first_stage_config:
      target: sgm.models.autoencoder.IdentityFirstStage

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 5.0

data:
  target: sgm.data.mnist.MNISTLoader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 16
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 16
          n_rows: 4

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 20
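This variant replaces continuous EDM noise levels with a fixed 1000-index DDPM-style schedule (LegacyDDPMDiscretization) and trains by drawing one of those indices per example (DiscreteSampling). Below is a sketch of how such a sigma table can be built; the linear-beta endpoints are an assumption borrowed from common Stable Diffusion schedules, not values read from this config.

import numpy as np


def ddpm_sigmas(num_idx: int = 1000, beta_start: float = 0.00085, beta_end: float = 0.0120) -> np.ndarray:
    betas = np.linspace(beta_start**0.5, beta_end**0.5, num_idx) ** 2
    alphas_cumprod = np.cumprod(1.0 - betas)
    # Convert the DDPM schedule to EDM-style sigmas: sqrt((1 - abar) / abar).
    return np.sqrt((1.0 - alphas_cumprod) / alphas_cumprod)


sigmas = ddpm_sigmas()  # DiscreteSampling would pick a random index into this table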
    	
        configs/example_training/toy/mnist_cond_l1_loss.yaml
    ADDED
    
@@ -0,0 +1,99 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
          params:
            sigma_data: 1.0

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        in_channels: 1
        out_channels: 1
        model_channels: 32
        attention_resolutions: []
        num_res_blocks: 4
        channel_mult: [1, 2, 2]
        num_head_channels: 32
        num_classes: sequential
        adm_in_channels: 128

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: cls
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.ClassEmbedder
            params:
              embed_dim: 128
              n_classes: 10

    first_stage_config:
      target: sgm.models.autoencoder.IdentityFirstStage

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_type: l1
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 3.0

data:
  target: sgm.data.mnist.MNISTLoader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 64
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 64
          n_rows: 8

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 20
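The only change relative to mnist_cond.yaml is loss_type: l1. A hedged sketch of a per-sigma weighted diffusion loss with a selectable l1/l2 norm; the EDM-style weight (sigma^2 + sigma_data^2) / (sigma * sigma_data)^2 follows the EDM literature and is assumed rather than copied from the repo.

import torch


def weighted_loss(pred: torch.Tensor, target: torch.Tensor, sigma: torch.Tensor,
                  loss_type: str = "l1", sigma_data: float = 1.0) -> torch.Tensor:
    # Per-example weight as a function of the sampled noise level sigma (shape [B]).
    w = (sigma**2 + sigma_data**2) / (sigma * sigma_data) ** 2
    err = (pred - target).flatten(1)
    per_sample = err.abs().mean(1) if loss_type == "l1" else err.pow(2).mean(1)
    return (w * per_sample).mean()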
    	
        configs/example_training/toy/mnist_cond_with_ema.yaml
    ADDED
    
    | 
         @@ -0,0 +1,100 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            model:
         
     | 
| 2 | 
         
            +
              base_learning_rate: 1.0e-4
         
     | 
| 3 | 
         
            +
              target: sgm.models.diffusion.DiffusionEngine
         
     | 
| 4 | 
         
            +
              params:
         
     | 
| 5 | 
         
            +
                use_ema: True
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
                denoiser_config:
         
     | 
| 8 | 
         
            +
                  target: sgm.modules.diffusionmodules.denoiser.Denoiser
         
     | 
| 9 | 
         
            +
                  params:
         
     | 
| 10 | 
         
            +
                    scaling_config:
         
     | 
| 11 | 
         
            +
                      target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
         
     | 
| 12 | 
         
            +
                      params:
         
     | 
| 13 | 
         
            +
                        sigma_data: 1.0
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
                network_config:
         
     | 
| 16 | 
         
            +
                  target: sgm.modules.diffusionmodules.openaimodel.UNetModel
         
     | 
| 17 | 
         
            +
                  params:
         
     | 
| 18 | 
         
            +
                    in_channels: 1
         
     | 
| 19 | 
         
            +
                    out_channels: 1
         
     | 
| 20 | 
         
            +
                    model_channels: 32
         
     | 
| 21 | 
         
            +
                    attention_resolutions: []
         
     | 
| 22 | 
         
            +
                    num_res_blocks: 4
         
     | 
| 23 | 
         
            +
                    channel_mult: [1, 2, 2]
         
     | 
| 24 | 
         
            +
                    num_head_channels: 32
         
     | 
| 25 | 
         
            +
                    num_classes: sequential
         
     | 
| 26 | 
         
            +
                    adm_in_channels: 128
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
                conditioner_config:
         
     | 
| 29 | 
         
            +
                  target: sgm.modules.GeneralConditioner
         
     | 
| 30 | 
         
            +
                  params:
         
     | 
| 31 | 
         
            +
                    emb_models:
         
     | 
| 32 | 
         
            +
                      - is_trainable: True
         
     | 
| 33 | 
         
            +
                        input_key: cls
         
     | 
| 34 | 
         
            +
                        ucg_rate: 0.2
         
     | 
| 35 | 
         
            +
                        target: sgm.modules.encoders.modules.ClassEmbedder
         
     | 
| 36 | 
         
            +
                        params:
         
     | 
| 37 | 
         
            +
                          embed_dim: 128
         
     | 
| 38 | 
         
            +
                          n_classes: 10
         
     | 
| 39 | 
         
            +
             
     | 
| 40 | 
         
            +
                first_stage_config:
         
     | 
| 41 | 
         
            +
                  target: sgm.models.autoencoder.IdentityFirstStage
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
                loss_fn_config:
         
     | 
| 44 | 
         
            +
                  target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
         
     | 
| 45 | 
         
            +
                  params:
         
     | 
| 46 | 
         
            +
                    loss_weighting_config:
         
     | 
| 47 | 
         
            +
                      target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
         
     | 
| 48 | 
         
            +
                      params:
         
     | 
| 49 | 
         
            +
                        sigma_data: 1.0
         
     | 
| 50 | 
         
            +
                    sigma_sampler_config:
         
     | 
| 51 | 
         
            +
                      target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
         
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 3.0

data:
  target: sgm.data.mnist.MNISTLoader
  params:
    batch_size: 512
    num_workers: 1

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        batch_frequency: 1000
        max_images: 64
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 64
          n_rows: 8

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 20
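Note (editorial, not part of the committed file): a minimal Python sketch of how the sampler block above could be instantiated, assuming OmegaConf and the repository's sgm.util.instantiate_from_config helper are importable:

from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

# Load the example config and build only its sampler block.
config = OmegaConf.load("configs/example_training/toy/mnist_cond_with_ema.yaml")
sampler_cfg = config.model.params.sampler_config  # EulerEDMSampler, 50 steps, CFG scale 3.0
sampler = instantiate_from_config(sampler_cfg)
print(type(sampler).__name__)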
    	
configs/example_training/txt2img-clipl-legacy-ucg-training.yaml
ADDED
@@ -0,0 +1,182 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True
    log_keys:
      - txt

    scheduler_config:
      target: sgm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [10000]
        cycle_lengths: [10000000000000]
        f_start: [1.e-6]
        f_max: [1.]
        f_min: [1.]

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [1, 2, 4]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        num_classes: sequential
        adm_in_channels: 1792
        num_heads: 1
        transformer_depth: 1
        context_dim: 768
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: txt
            ucg_rate: 0.1
            legacy_ucg_value: ""
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              always_return_pooled: True

          - is_trainable: False
            ucg_rate: 0.1
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        ckpt_path: CKPT_PATH
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 4, 4 ]
          num_res_blocks: 2
          attn_resolutions: [ ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 7.5

data:
  target: sgm.data.dataset.StableDataModuleFromConfig
  params:
    train:
      datapipeline:
        urls:
          # USER: adapt this path to the root of your custom dataset
          - DATA_PATH
        pipeline_config:
          shardshuffle: 10000
          sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM

        decoders:
          - pil

        postprocessors:
          - target: sdata.mappers.TorchVisionImageTransforms
            params:
              key: jpg # USER: you might wanna adapt this for your custom dataset
              transforms:
                - target: torchvision.transforms.Resize
                  params:
                    size: 256
                    interpolation: 3
                - target: torchvision.transforms.ToTensor
          - target: sdata.mappers.Rescaler
          - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
            # USER: you might wanna use non-default parameters due to your custom dataset

      loader:
        batch_size: 64
        num_workers: 6

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        enable_autocast: False
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 8
          n_rows: 2

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 1000
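Note (editorial): ucg_rate: 0.1 together with legacy_ucg_value: "" expresses the legacy unconditional-guidance dropout this config is named after — roughly 10% of text prompts are replaced by the empty string during training so the model also learns the unconditional branch used by classifier-free guidance. A minimal illustrative sketch; apply_legacy_ucg is a hypothetical helper, not part of the repository:

import random

def apply_legacy_ucg(prompt: str, ucg_rate: float = 0.1, legacy_ucg_value: str = "") -> str:
    # With probability ucg_rate, drop the prompt and use the legacy placeholder value.
    return legacy_ucg_value if random.random() < ucg_rate else prompt

batch = ["a photo of a cat", "an oil painting of a ship"]
batch = [apply_legacy_ucg(p) for p in batch]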
    	
configs/example_training/txt2img-clipl.yaml
ADDED
@@ -0,0 +1,184 @@
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True
    log_keys:
      - txt

    scheduler_config:
      target: sgm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [10000]
        cycle_lengths: [10000000000000]
        f_start: [1.e-6]
        f_max: [1.]
        f_min: [1.]

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [1, 2, 4]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        num_classes: sequential
        adm_in_channels: 1792
        num_heads: 1
        transformer_depth: 1
        context_dim: 768
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: True
            input_key: txt
            ucg_rate: 0.1
            legacy_ucg_value: ""
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              always_return_pooled: True

          - is_trainable: False
            ucg_rate: 0.1
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        ckpt_path: CKPT_PATH
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFG
          params:
            scale: 7.5

data:
  target: sgm.data.dataset.StableDataModuleFromConfig
  params:
    train:
      datapipeline:
        urls:
          # USER: adapt this path to the root of your custom dataset
          - DATA_PATH
        pipeline_config:
          shardshuffle: 10000
          sample_shuffle: 10000


        decoders:
          - pil

        postprocessors:
          - target: sdata.mappers.TorchVisionImageTransforms
            params:
              key: jpg # USER: you might wanna adapt this for your custom dataset
              transforms:
                - target: torchvision.transforms.Resize
                  params:
                    size: 256
                    interpolation: 3
                - target: torchvision.transforms.ToTensor
          - target: sdata.mappers.Rescaler
            # USER: you might wanna use non-default parameters due to your custom dataset
          - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
            # USER: you might wanna use non-default parameters due to your custom dataset

      loader:
        batch_size: 64
        num_workers: 6

lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 5000

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        enable_autocast: False
        batch_frequency: 1000
        max_images: 8
        increase_log_steps: True
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 8
          n_rows: 2

  trainer:
    devices: 0,
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_epochs: 1000
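Note (editorial): the guider_config above (VanillaCFG, scale: 7.5) applies the standard classifier-free guidance combination at sampling time. A minimal sketch of that combination, assuming conditional and unconditional model outputs of the same shape:

import torch

def cfg_combine(cond: torch.Tensor, uncond: torch.Tensor, scale: float = 7.5) -> torch.Tensor:
    # Move from the unconditional prediction towards the conditional one by the guidance scale.
    return uncond + scale * (cond - uncond)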
    	
configs/inference/sd_2_1.yaml
ADDED
@@ -0,0 +1,60 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
            params:
              freeze: true
              layer: penultimate

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
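Note (editorial, not part of the committed file): a minimal sketch of building the SD 2.1 inference graph from this config, assuming OmegaConf and the repository's sgm.util.instantiate_from_config helper; checkpoint loading is repository-specific and omitted here:

from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

config = OmegaConf.load("configs/inference/sd_2_1.yaml")
model = instantiate_from_config(config.model)  # sgm.models.diffusion.DiffusionEngine
model.eval()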
    	
configs/inference/sd_2_1_768.yaml
ADDED
@@ -0,0 +1,60 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
            params:
              freeze: true
              layer: penultimate

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
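Note (editorial): this config differs from sd_2_1.yaml in its scaling_config, which uses VScaling instead of EpsScaling because the 768 checkpoint is a v-prediction model. A hedged sketch of the standard v-prediction scalings such a module is expected to compute; check the repository's denoiser_scaling.py for the authoritative definitions:

import torch

def v_scalings(sigma: torch.Tensor):
    # Standard v-prediction coefficients as functions of the noise level sigma (assumption,
    # shown for illustration; not copied from the repository).
    c_skip = 1.0 / (sigma**2 + 1.0)
    c_out = -sigma / torch.sqrt(sigma**2 + 1.0)
    c_in = 1.0 / torch.sqrt(sigma**2 + 1.0)
    return c_skip, c_out, c_in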
    	
configs/inference/sd_xl_base.yaml
ADDED
@@ -0,0 +1,93 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11

          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
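Configs in this layout are consumed by loading the YAML with OmegaConf and instantiating the model tree. A minimal sketch, assuming the sgm package from this repo is importable and that sgm.util.instantiate_from_config is available as in the upstream generative-models codebase (everything outside the config itself is an assumption):

import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config  # assumed helper from the sgm package

# Load the inference config and build the DiffusionEngine it describes.
config = OmegaConf.load("configs/inference/sd_xl_base.yaml")
model = instantiate_from_config(config.model)
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()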
    	
configs/inference/sd_xl_refiner.yaml
ADDED
@@ -0,0 +1,86 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2560
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 384
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 4
        context_dim: [1280, 1280, 1280, 1280]
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              legacy: False
              freeze: True
              layer: penultimate
              always_return_pooled: True

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: aesthetic_score
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
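Each emb_models entry above reads one key from the conditioning batch (its input_key). A minimal sketch of the batch the refiner's GeneralConditioner would consume; the shapes, example values, and the commented conditioner call are illustrative assumptions, not part of the config:

import torch

# Keys mirror the input_key fields in the conditioner_config above.
batch = {
    "txt": ["a detailed photograph of a mountain lake at dawn"],
    "original_size_as_tuple": torch.tensor([[1024, 1024]]),
    "crop_coords_top_left": torch.tensor([[0, 0]]),
    "aesthetic_score": torch.tensor([[6.0]]),
}
# cond = model.conditioner(batch)  # assumed call; returns crossattn/vector embeddings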
    	
configs/inference/svd.yaml
ADDED
@@ -0,0 +1,131 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]
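The SVD conditioner above keys on one reference frame plus three scalar controls. A minimal sketch of that conditioning batch for a single 576x1024 frame; the cond_aug noise recipe and the example values follow the usual SVD sampling scripts and are assumptions here, not part of the config:

import torch

cond_aug = 0.02                                    # strength of the augmentation noise
image = torch.rand(1, 3, 576, 1024) * 2.0 - 1.0    # reference frame scaled to [-1, 1]

batch = {
    "cond_frames_without_noise": image,                         # read by the CLIP image embedder
    "cond_frames": image + cond_aug * torch.randn_like(image),  # read by the VAE encoder branch
    "fps_id": torch.tensor([6]),
    "motion_bucket_id": torch.tensor([127]),
    "cond_aug": torch.tensor([cond_aug]),
}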
    	
configs/inference/svd_image_decoder.yaml
ADDED
@@ -0,0 +1,114 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: True
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
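svd_image_decoder.yaml differs from svd.yaml only in its first stage: the denoiser, VideoUNet and conditioner blocks are identical, but decoding uses the plain AutoencoderKL image decoder instead of the temporal VideoDecoder. A quick check of that difference straight from the two configs (a sketch; paths assume the repo root):

from omegaconf import OmegaConf

svd = OmegaConf.load("configs/inference/svd.yaml")
svd_img = OmegaConf.load("configs/inference/svd_image_decoder.yaml")

print(svd.model.params.first_stage_config.target)      # sgm.models.autoencoder.AutoencodingEngine
print(svd_img.model.params.first_stage_config.target)  # sgm.models.autoencoder.AutoencoderKL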
    	
configs/inference/svd_mv.yaml
ADDED
@@ -0,0 +1,202 @@
model:
  base_learning_rate: 1.0e-05
  target: sgm.models.video_diffusion.DiffusionEngine
  params:
    ckpt_path: ckpts/svd_xt.safetensors
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    scheduler_config:
      target: sgm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 1
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size:
        - 3
        - 1
        - 1
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: false
          ucg_rate: 0.2
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: true
        - input_key: fps_id
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - input_key: motion_bucket_id
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - input_key: cond_frames
          is_trainable: false
          ucg_rate: 0.2
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: true
            n_cond_frames: 1
            n_copies: 1
            is_ae: true
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity
        - input_key: cond_aug
          is_trainable: true
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
            - 1
            - 2
            - 4
            - 4
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
            - 1
            - 2
            - 4
         
     | 
| 158 | 
         
            +
                        - 4
         
     | 
| 159 | 
         
            +
                        num_res_blocks: 2
         
     | 
| 160 | 
         
            +
                        attn_resolutions: []
         
     | 
| 161 | 
         
            +
                        dropout: 0.0
         
     | 
| 162 | 
         
            +
                        video_kernel_size:
         
     | 
| 163 | 
         
            +
                        - 3
         
     | 
| 164 | 
         
            +
                        - 1
         
     | 
| 165 | 
         
            +
                        - 1
         
     | 
| 166 | 
         
            +
                sampler_config:
         
     | 
| 167 | 
         
            +
                  target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
         
     | 
| 168 | 
         
            +
                  params:
         
     | 
| 169 | 
         
            +
                    num_steps: 30
         
     | 
| 170 | 
         
            +
                    discretization_config:
         
     | 
| 171 | 
         
            +
                      target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
         
     | 
| 172 | 
         
            +
                      params:
         
     | 
| 173 | 
         
            +
                        sigma_max: 700.0
         
     | 
| 174 | 
         
            +
                    guider_config:
         
     | 
| 175 | 
         
            +
                      target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
         
     | 
| 176 | 
         
            +
                      params:
         
     | 
| 177 | 
         
            +
                        max_scale: 2.5
         
     | 
| 178 | 
         
            +
                        min_scale: 1.0
         
     | 
| 179 | 
         
            +
                        num_frames: 24
         
     | 
| 180 | 
         
            +
                loss_fn_config:
         
     | 
| 181 | 
         
            +
                  target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
         
     | 
| 182 | 
         
            +
                  params:
         
     | 
| 183 | 
         
            +
                    batch2model_keys:
         
     | 
| 184 | 
         
            +
                    - num_video_frames
         
     | 
| 185 | 
         
            +
                    - image_only_indicator
         
     | 
| 186 | 
         
            +
                    loss_weighting_config:
         
     | 
| 187 | 
         
            +
                      target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
         
     | 
| 188 | 
         
            +
                      params:
         
     | 
| 189 | 
         
            +
                        sigma_data: 1.0
         
     | 
| 190 | 
         
            +
                    sigma_sampler_config:
         
     | 
| 191 | 
         
            +
                      target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
         
     | 
| 192 | 
         
            +
                      params:
         
     | 
| 193 | 
         
            +
                        p_mean: 0.3
         
     | 
| 194 | 
         
            +
                        p_std: 1.2
         
     | 
| 195 | 
         
            +
            data:
         
     | 
| 196 | 
         
            +
              target: sgm.data.objaverse.ObjaverseSpiralDataset
         
     | 
| 197 | 
         
            +
              params:
         
     | 
| 198 | 
         
            +
                root_dir: /mnt/mfs/zilong.chen/Downloads/objaverse-ndd-samples
         
     | 
| 199 | 
         
            +
                random_front: true
         
     | 
| 200 | 
         
            +
                batch_size: 2
         
     | 
| 201 | 
         
            +
                num_workers: 16
         
     | 
| 202 | 
         
            +
                cond_aug_mean: -0.0
         
     | 
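Every block in this config follows the sgm `target:`/`params:` convention: `target` names a class by import path and `params` are its constructor arguments, with nested `target:`/`params:` blocks instantiated by the corresponding module constructors. A minimal sketch of how a config of this shape is typically consumed, assuming the vendored sgm package exposes `sgm.util.instantiate_from_config` (the config path below is only an example):

from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

# Load the YAML and build the diffusion model from its `model` block.
config = OmegaConf.load("configs/inference/svd_mv.yaml")
model = instantiate_from_config(config.model)
model.eval()

# The `data` block uses the same convention and could be instantiated the same way:
# datamodule = instantiate_from_config(config.data)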
    	
        mesh_recon/configs/neuralangelo-ortho-wmask.yaml
    ADDED
    
@@ -0,0 +1,145 @@
name: ${basename:${dataset.scene}}
tag: ""
seed: 42

dataset:
  name: ortho
  root_dir: /home/xiaoxiao/Workplace/wonder3Dplus/outputs/joint-twice/aigc/cropsize-224-cfg1.0
  cam_pose_dir: null
  scene: scene_name
  imSize: [1024, 1024]  # should use larger res, otherwise the exported mesh has wrong colors
  camera_type: ortho
  apply_mask: true
  camera_params: null
  view_weights: [1.0, 0.8, 0.2, 1.0, 0.4, 0.7]  #['front', 'front_right', 'right', 'back', 'left', 'front_left']
  # view_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

model:
  name: neus
  radius: 1.0
  num_samples_per_ray: 1024
  train_num_rays: 256
  max_train_num_rays: 8192
  grid_prune: true
  grid_prune_occ_thre: 0.001
  dynamic_ray_sampling: true
  batch_image_sampling: true
  randomized: true
  ray_chunk: 2048
  cos_anneal_end: 20000
  learned_background: false
  background_color: black
  variance:
    init_val: 0.3
    modulate: false
  geometry:
    name: volume-sdf
    radius: ${model.radius}
    feature_dim: 13
    grad_type: finite_difference
    finite_difference_eps: progressive
    isosurface:
      method: mc
      resolution: 192
      chunk: 2097152
      threshold: 0.
    xyz_encoding_config:
      otype: ProgressiveBandHashGrid
      n_levels: 10 # 12 modify
      n_features_per_level: 2
      log2_hashmap_size: 19
      base_resolution: 32
      per_level_scale: 1.3195079107728942
      include_xyz: true
      start_level: 4
      start_step: 0
      update_steps: 1000
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 1
      sphere_init: true
      sphere_init_radius: 0.5
      weight_norm: true
  texture:
    name: volume-radiance
    input_feature_dim: ${add:${model.geometry.feature_dim},3} # surface normal as additional input
    dir_encoding_config:
      otype: SphericalHarmonics
      degree: 4
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 2
    color_activation: sigmoid

system:
  name: ortho-neus-system
  loss:
    lambda_rgb_mse: 0.5
    lambda_rgb_l1: 0.
    lambda_mask: 1.0
    lambda_eikonal: 0.2  # cannot be too large, will cause holes to thin objects
    lambda_normal: 1.0  # cannot be too large
    lambda_3d_normal_smooth: 1.0
    # lambda_curvature: [0, 0.0, 1.e-4, 1000] # topology warmup
    lambda_curvature: 0.
    lambda_sparsity: 0.5
    lambda_distortion: 0.0
    lambda_distortion_bg: 0.0
    lambda_opaque: 0.0
    sparsity_scale: 100.0
    geo_aware: true
    rgb_p_ratio: 0.8
    normal_p_ratio: 0.8
    mask_p_ratio: 0.9
  optimizer:
    name: AdamW
    args:
      lr: 0.01
      betas: [0.9, 0.99]
      eps: 1.e-15
    params:
      geometry:
        lr: 0.001
      texture:
        lr: 0.01
      variance:
        lr: 0.001
  constant_steps: 500
  scheduler:
    name: SequentialLR
    interval: step
    milestones:
      - ${system.constant_steps}
    schedulers:
      - name: ConstantLR
        args:
          factor: 1.0
          total_iters: ${system.constant_steps}
      - name: ExponentialLR
        args:
          gamma: ${calc_exp_lr_decay_rate:0.1,${sub:${trainer.max_steps},${system.constant_steps}}}

checkpoint:
  save_top_k: -1
  every_n_train_steps: ${trainer.max_steps}

export:
  chunk_size: 2097152
  export_vertex_color: True
  ortho_scale: 1.35   #modify

trainer:
  max_steps: 3000
  log_every_n_steps: 100
  num_sanity_val_steps: 0
  val_check_interval: 4000
  limit_train_batches: 1.0
  limit_val_batches: 2
  enable_progress_bar: true
  precision: 16
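The `${basename:...}`, `${add:...}`, `${sub:...}` and `${calc_exp_lr_decay_rate:...}` interpolations in this config are not built into OmegaConf; they rely on custom resolvers that the mesh_recon code registers before loading the config, and their exact definitions are not part of this diff. A rough sketch of what such registrations typically look like, with the ExponentialLR gamma worked out for the numbers in this file:

import os
from omegaconf import OmegaConf

# Assumed resolver definitions (illustrative; the real ones live elsewhere in mesh_recon):
OmegaConf.register_new_resolver("basename", lambda p: os.path.basename(p))
OmegaConf.register_new_resolver("add", lambda a, b: a + b)
OmegaConf.register_new_resolver("sub", lambda a, b: a - b)
# Per-step decay rate so the LR shrinks by `factor` over `n` steps:
# gamma = factor ** (1 / n); here 0.1 ** (1 / (3000 - 500)) ≈ 0.99908.
OmegaConf.register_new_resolver(
    "calc_exp_lr_decay_rate", lambda factor, n: factor ** (1.0 / n)
)

cfg = OmegaConf.load("mesh_recon/configs/neuralangelo-ortho-wmask.yaml")
print(cfg.system.scheduler.schedulers[1].args.gamma)  # ≈ 0.99908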
    	
        mesh_recon/configs/v3d.yaml
    ADDED
    
@@ -0,0 +1,144 @@
name: ${basename:${dataset.scene}}
tag: ""
seed: 42

dataset:
  name: v3d
  root_dir: ./spirals
  cam_pose_dir: null
  scene: pizza_man
  apply_mask: true
  train_split: train
  test_split: train
  val_split: train
  img_wh: [1024, 1024]

model:
  name: neus
  radius: 1.0 ## check this
  num_samples_per_ray: 1024
  train_num_rays: 256
  max_train_num_rays: 8192
  grid_prune: true
  grid_prune_occ_thre: 0.001
  dynamic_ray_sampling: true
  batch_image_sampling: true
  randomized: true
  ray_chunk: 2048
  cos_anneal_end: 20000
  learned_background: false
  background_color: black
  variance:
    init_val: 0.3
    modulate: false
  geometry:
    name: volume-sdf
    radius: ${model.radius}
    feature_dim: 13
    grad_type: finite_difference
    finite_difference_eps: progressive
    isosurface:
      method: mc
      resolution: 384
      chunk: 2097152
      threshold: 0.
    xyz_encoding_config:
      otype: ProgressiveBandHashGrid
      n_levels: 10 # 12 modify
      n_features_per_level: 2
      log2_hashmap_size: 19
      base_resolution: 32
      per_level_scale: 1.3195079107728942
      include_xyz: true
      start_level: 4
      start_step: 0
      update_steps: 1000
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 1
      sphere_init: true
      sphere_init_radius: 0.5
      weight_norm: true
  texture:
    name: volume-radiance
    input_feature_dim: ${add:${model.geometry.feature_dim},3} # surface normal as additional input
    dir_encoding_config:
      otype: SphericalHarmonics
      degree: 4
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 2
    color_activation: sigmoid

system:
  name: videonvs-neus-system
  loss:
    lambda_rgb_mse: 0.5
    lambda_rgb_l1: 0.
    lambda_mask: 1.0
    lambda_eikonal: 0.2  # cannot be too large, will cause holes to thin objects
    lambda_normal: 0.0  # cannot be too large
    lambda_3d_normal_smooth: 1.0
    # lambda_curvature: [0, 0.0, 1.e-4, 1000] # topology warmup
    lambda_curvature: 0.
    lambda_sparsity: 0.5
    lambda_distortion: 0.0
    lambda_distortion_bg: 0.0
    lambda_opaque: 0.0
    sparsity_scale: 100.0
    geo_aware: true
    rgb_p_ratio: 0.8
    normal_p_ratio: 0.8
    mask_p_ratio: 0.9
  optimizer:
    name: AdamW
    args:
      lr: 0.01
      betas: [0.9, 0.99]
      eps: 1.e-15
    params:
      geometry:
        lr: 0.001
      texture:
        lr: 0.01
      variance:
        lr: 0.001
  constant_steps: 500
  scheduler:
    name: SequentialLR
    interval: step
    milestones:
      - ${system.constant_steps}
    schedulers:
      - name: ConstantLR
        args:
          factor: 1.0
          total_iters: ${system.constant_steps}
      - name: ExponentialLR
        args:
          gamma: ${calc_exp_lr_decay_rate:0.1,${sub:${trainer.max_steps},${system.constant_steps}}}

checkpoint:
  save_top_k: -1
  every_n_train_steps: ${trainer.max_steps}

export:
  chunk_size: 2097152
  export_vertex_color: True
  ortho_scale: null   #modify

trainer:
  max_steps: 3000
  log_every_n_steps: 100
  num_sanity_val_steps: 0
  val_check_interval: 3000
  limit_train_batches: 1.0
  limit_val_batches: 2
  enable_progress_bar: true
  precision: 16
    	
        mesh_recon/configs/videonvs.yaml
    ADDED
    
@@ -0,0 +1,144 @@
name: ${basename:${dataset.scene}}
tag: ""
seed: 42

dataset:
  name: videonvs
  root_dir: ./spirals
  cam_pose_dir: null
  scene: pizza_man
  apply_mask: true
  train_split: train
  test_split: train
  val_split: train
  img_wh: [1024, 1024]

model:
  name: neus
  radius: 1.0 ## check this
  num_samples_per_ray: 1024
  train_num_rays: 256
  max_train_num_rays: 8192
  grid_prune: true
  grid_prune_occ_thre: 0.001
  dynamic_ray_sampling: true
  batch_image_sampling: true
  randomized: true
  ray_chunk: 2048
  cos_anneal_end: 20000
  learned_background: false
  background_color: black
  variance:
    init_val: 0.3
    modulate: false
  geometry:
    name: volume-sdf
    radius: ${model.radius}
    feature_dim: 13
    grad_type: finite_difference
    finite_difference_eps: progressive
    isosurface:
      method: mc
      resolution: 384
      chunk: 2097152
      threshold: 0.
    xyz_encoding_config:
      otype: ProgressiveBandHashGrid
      n_levels: 10 # 12 modify
      n_features_per_level: 2
      log2_hashmap_size: 19
      base_resolution: 32
      per_level_scale: 1.3195079107728942
      include_xyz: true
      start_level: 4
      start_step: 0
      update_steps: 1000
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 1
      sphere_init: true
      sphere_init_radius: 0.5
      weight_norm: true
  texture:
    name: volume-radiance
    input_feature_dim: ${add:${model.geometry.feature_dim},3} # surface normal as additional input
    dir_encoding_config:
      otype: SphericalHarmonics
      degree: 4
    mlp_network_config:
      otype: VanillaMLP
      activation: ReLU
      output_activation: none
      n_neurons: 64
      n_hidden_layers: 2
    color_activation: sigmoid

system:
  name: videonvs-neus-system
  loss:
    lambda_rgb_mse: 0.5
    lambda_rgb_l1: 0.
    lambda_mask: 1.0
    lambda_eikonal: 0.2  # cannot be too large, will cause holes to thin objects
    lambda_normal: 1.0  # cannot be too large
    lambda_3d_normal_smooth: 1.0
    # lambda_curvature: [0, 0.0, 1.e-4, 1000] # topology warmup
    lambda_curvature: 0.
    lambda_sparsity: 0.5
    lambda_distortion: 0.0
    lambda_distortion_bg: 0.0
    lambda_opaque: 0.0
    sparsity_scale: 100.0
    geo_aware: true
    rgb_p_ratio: 0.8
    normal_p_ratio: 0.8
    mask_p_ratio: 0.9
  optimizer:
    name: AdamW
    args:
      lr: 0.01
      betas: [0.9, 0.99]
      eps: 1.e-15
    params:
      geometry:
        lr: 0.001
      texture:
        lr: 0.01
      variance:
        lr: 0.001
  constant_steps: 500
  scheduler:
    name: SequentialLR
    interval: step
    milestones:
      - ${system.constant_steps}
    schedulers:
      - name: ConstantLR
        args:
          factor: 1.0
          total_iters: ${system.constant_steps}
      - name: ExponentialLR
        args:
          gamma: ${calc_exp_lr_decay_rate:0.1,${sub:${trainer.max_steps},${system.constant_steps}}}

checkpoint:
  save_top_k: -1
  every_n_train_steps: ${trainer.max_steps}

export:
  chunk_size: 2097152
  export_vertex_color: True
  ortho_scale: null   #modify

trainer:
  max_steps: 3000
  log_every_n_steps: 100
  num_sanity_val_steps: 0
  val_check_interval: 3000
  limit_train_batches: 1.0
  limit_val_batches: 2
  enable_progress_bar: true
  precision: 16
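The `system.optimizer` block in these configs keeps global AdamW arguments under `args` and per-submodule learning rates under `params` (geometry, texture, variance). A rough sketch of how such a block is commonly expanded into torch parameter groups, assuming the model exposes submodules with those names; the actual helper in mesh_recon is defined elsewhere and is not part of this diff:

import torch
from omegaconf import OmegaConf

def build_optimizer(model, optim_cfg):
    # One parameter group per named submodule; each group's entries (e.g. lr)
    # override the global optimizer args for those parameters.
    groups = [
        {"params": getattr(model, name).parameters(), **OmegaConf.to_container(overrides)}
        for name, overrides in optim_cfg.params.items()
    ]
    optim_cls = getattr(torch.optim, optim_cfg.name)  # AdamW for these configs
    return optim_cls(groups, **OmegaConf.to_container(optim_cfg.args))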
    	
        mesh_recon/datasets/__init__.py
    ADDED
    
@@ -0,0 +1,17 @@
datasets = {}


def register(name):
    def decorator(cls):
        datasets[name] = cls
        return cls

    return decorator


def make(name, config):
    dataset = datasets[name](config)
    return dataset


from . import blender, colmap, dtu, ortho, videonvs, videonvs_co3d, v3d
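This module is a small name-based registry: `register(name)` is a class decorator that records a dataset class under a string key (the trailing import line triggers registration of every bundled loader), and `make(name, config)` looks the class up and instantiates it. An illustrative sketch of the intended usage; the "toy" dataset and its config are made up for the example:

from omegaconf import OmegaConf

import datasets  # the mesh_recon package shown above


@datasets.register("toy")  # hypothetical dataset, for illustration only
class ToyDataset:
    def __init__(self, config):
        self.config = config


cfg = OmegaConf.create({"root_dir": "./data", "img_wh": [256, 256]})
ds = datasets.make("toy", cfg)  # looks up the registered class and instantiates it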
    	
        mesh_recon/datasets/blender.py
    ADDED
    
@@ -0,0 +1,143 @@
+import os
+import json
+import math
+import numpy as np
+from PIL import Image
+
+import torch
+from torch.utils.data import Dataset, DataLoader, IterableDataset
+import torchvision.transforms.functional as TF
+
+import pytorch_lightning as pl
+
+import datasets
+from models.ray_utils import get_ray_directions
+from utils.misc import get_rank
+
+
+class BlenderDatasetBase:
+    def setup(self, config, split):
+        self.config = config
+        self.split = split
+        self.rank = get_rank()
+
+        self.has_mask = True
+        self.apply_mask = True
+
+        with open(
+            os.path.join(self.config.root_dir, f"transforms_{self.split}.json"), "r"
+        ) as f:
+            meta = json.load(f)
+
+        if "w" in meta and "h" in meta:
+            W, H = int(meta["w"]), int(meta["h"])
+        else:
+            W, H = 800, 800
+
+        if "img_wh" in self.config:
+            w, h = self.config.img_wh
+            assert round(W / w * h) == H
+        elif "img_downscale" in self.config:
+            w, h = W // self.config.img_downscale, H // self.config.img_downscale
+        else:
+            raise KeyError("Either img_wh or img_downscale should be specified.")
+
+        self.w, self.h = w, h
+        self.img_wh = (self.w, self.h)
+
+        self.near, self.far = self.config.near_plane, self.config.far_plane
+
+        self.focal = (
+            0.5 * w / math.tan(0.5 * meta["camera_angle_x"])
+        )  # scaled focal length
+
+        # ray directions for all pixels, same for all images (same H, W, focal)
+        self.directions = get_ray_directions(
+            self.w, self.h, self.focal, self.focal, self.w // 2, self.h // 2
+        ).to(
+            self.rank
+        )  # (h, w, 3)
+
+        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []
+
+        for i, frame in enumerate(meta["frames"]):
+            c2w = torch.from_numpy(np.array(frame["transform_matrix"])[:3, :4])
+            self.all_c2w.append(c2w)
+
+            img_path = os.path.join(self.config.root_dir, f"{frame['file_path']}.png")
+            img = Image.open(img_path)
+            img = img.resize(self.img_wh, Image.BICUBIC)
+            img = TF.to_tensor(img).permute(1, 2, 0)  # (4, h, w) => (h, w, 4)
+
+            self.all_fg_masks.append(img[..., -1])  # (h, w)
+            self.all_images.append(img[..., :3])
+
+        self.all_c2w, self.all_images, self.all_fg_masks = (
+            torch.stack(self.all_c2w, dim=0).float().to(self.rank),
+            torch.stack(self.all_images, dim=0).float().to(self.rank),
+            torch.stack(self.all_fg_masks, dim=0).float().to(self.rank),
+        )
+
+
+class BlenderDataset(Dataset, BlenderDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __len__(self):
+        return len(self.all_images)
+
+    def __getitem__(self, index):
+        return {"index": index}
+
+
+class BlenderIterableDataset(IterableDataset, BlenderDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __iter__(self):
+        while True:
+            yield {}
+
+
+@datasets.register("blender")
+class VideoNVSDataModule(pl.LightningDataModule):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+    def setup(self, stage=None):
+        if stage in [None, "fit"]:
+            self.train_dataset = BlenderIterableDataset(
+                self.config, self.config.train_split
+            )
+        if stage in [None, "fit", "validate"]:
+            self.val_dataset = BlenderDataset(self.config, self.config.val_split)
+        if stage in [None, "test"]:
+            self.test_dataset = BlenderDataset(self.config, self.config.test_split)
+        if stage in [None, "predict"]:
+            self.predict_dataset = BlenderDataset(self.config, self.config.train_split)
+
+    def prepare_data(self):
+        pass
+
+    def general_loader(self, dataset, batch_size):
+        sampler = None
+        return DataLoader(
+            dataset,
+            num_workers=os.cpu_count(),
+            batch_size=batch_size,
+            pin_memory=True,
+            sampler=sampler,
+        )
+
+    def train_dataloader(self):
+        return self.general_loader(self.train_dataset, batch_size=1)
+
+    def val_dataloader(self):
+        return self.general_loader(self.val_dataset, batch_size=1)
+
+    def test_dataloader(self):
+        return self.general_loader(self.test_dataset, batch_size=1)
+
+    def predict_dataloader(self):
+        return self.general_loader(self.predict_dataset, batch_size=1)
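
For reference, `BlenderDatasetBase` above expects the standard NeRF-synthetic layout under `root_dir`: a `transforms_{split}.json` holding `camera_angle_x` plus a list of frames, and RGBA PNGs whose alpha channel becomes the foreground mask. An illustrative sketch of that file, written here as a Python dict (the numeric values are made up):

    transforms_train = {
        "camera_angle_x": 0.6911,      # horizontal field of view in radians
        "frames": [
            {
                "file_path": "./train/r_0",      # resolved to <root_dir>/train/r_0.png
                "transform_matrix": [            # 4x4 camera-to-world matrix
                    [1.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 0.0],
                    [0.0, 0.0, 1.0, 4.0],
                    [0.0, 0.0, 0.0, 1.0],
                ],
            },
            # ... one entry per view
        ],
    }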
    	
mesh_recon/datasets/colmap.py
ADDED
@@ -0,0 +1,332 @@
+import os
+import math
+import numpy as np
+from PIL import Image
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader, IterableDataset
+import torchvision.transforms.functional as TF
+
+import pytorch_lightning as pl
+
+import datasets
+from datasets.colmap_utils import \
+    read_cameras_binary, read_images_binary, read_points3d_binary
+from models.ray_utils import get_ray_directions
+from utils.misc import get_rank
+
+
+def get_center(pts):
+    center = pts.mean(0)
+    dis = (pts - center[None,:]).norm(p=2, dim=-1)
+    mean, std = dis.mean(), dis.std()
+    q25, q75 = torch.quantile(dis, 0.25), torch.quantile(dis, 0.75)
+    valid = (dis > mean - 1.5 * std) & (dis < mean + 1.5 * std) & (dis > mean - (q75 - q25) * 1.5) & (dis < mean + (q75 - q25) * 1.5)
+    center = pts[valid].mean(0)
+    return center
+
+def normalize_poses(poses, pts, up_est_method, center_est_method):
+    if center_est_method == 'camera':
+        # estimation scene center as the average of all camera positions
+        center = poses[...,3].mean(0)
+    elif center_est_method == 'lookat':
+        # estimation scene center as the average of the intersection of selected pairs of camera rays
+        cams_ori = poses[...,3]
+        cams_dir = poses[:,:3,:3] @ torch.as_tensor([0.,0.,-1.])
+        cams_dir = F.normalize(cams_dir, dim=-1)
+        A = torch.stack([cams_dir, -cams_dir.roll(1,0)], dim=-1)
+        b = -cams_ori + cams_ori.roll(1,0)
+        t = torch.linalg.lstsq(A, b).solution
+        center = (torch.stack([cams_dir, cams_dir.roll(1,0)], dim=-1) * t[:,None,:] + torch.stack([cams_ori, cams_ori.roll(1,0)], dim=-1)).mean((0,2))
+    elif center_est_method == 'point':
+        # first estimation scene center as the average of all camera positions
+        # later we'll use the center of all points bounded by the cameras as the final scene center
+        center = poses[...,3].mean(0)
+    else:
+        raise NotImplementedError(f'Unknown center estimation method: {center_est_method}')
+
+    if up_est_method == 'ground':
+        # estimate up direction as the normal of the estimated ground plane
+        # use RANSAC to estimate the ground plane in the point cloud
+        import pyransac3d as pyrsc
+        ground = pyrsc.Plane()
+        plane_eq, inliers = ground.fit(pts.numpy(), thresh=0.01) # TODO: determine thresh based on scene scale
+        plane_eq = torch.as_tensor(plane_eq) # A, B, C, D in Ax + By + Cz + D = 0
+        z = F.normalize(plane_eq[:3], dim=-1) # plane normal as up direction
+        signed_distance = (torch.cat([pts, torch.ones_like(pts[...,0:1])], dim=-1) * plane_eq).sum(-1)
+        if signed_distance.mean() < 0:
+            z = -z # flip the direction if points lie under the plane
+    elif up_est_method == 'camera':
+        # estimate up direction as the average of all camera up directions
+        z = F.normalize((poses[...,3] - center).mean(0), dim=0)
+    else:
+        raise NotImplementedError(f'Unknown up estimation method: {up_est_method}')
+
+    # new axis
+    y_ = torch.as_tensor([z[1], -z[0], 0.])
+    x = F.normalize(y_.cross(z), dim=0)
+    y = z.cross(x)
+
+    if center_est_method == 'point':
+        # rotation
+        Rc = torch.stack([x, y, z], dim=1)
+        R = Rc.T
+        poses_homo = torch.cat([poses, torch.as_tensor([[[0.,0.,0.,1.]]]).expand(poses.shape[0], -1, -1)], dim=1)
+        inv_trans = torch.cat([torch.cat([R, torch.as_tensor([[0.,0.,0.]]).T], dim=1), torch.as_tensor([[0.,0.,0.,1.]])], dim=0)
+        poses_norm = (inv_trans @ poses_homo)[:,:3]
+        pts = (inv_trans @ torch.cat([pts, torch.ones_like(pts[:,0:1])], dim=-1)[...,None])[:,:3,0]
+
+        # translation and scaling
+        poses_min, poses_max = poses_norm[...,3].min(0)[0], poses_norm[...,3].max(0)[0]
+        pts_fg = pts[(poses_min[0] < pts[:,0]) & (pts[:,0] < poses_max[0]) & (poses_min[1] < pts[:,1]) & (pts[:,1] < poses_max[1])]
+        center = get_center(pts_fg)
+        tc = center.reshape(3, 1)
+        t = -tc
+        poses_homo = torch.cat([poses_norm, torch.as_tensor([[[0.,0.,0.,1.]]]).expand(poses_norm.shape[0], -1, -1)], dim=1)
+        inv_trans = torch.cat([torch.cat([torch.eye(3), t], dim=1), torch.as_tensor([[0.,0.,0.,1.]])], dim=0)
+        poses_norm = (inv_trans @ poses_homo)[:,:3]
+        scale = poses_norm[...,3].norm(p=2, dim=-1).min()
+        poses_norm[...,3] /= scale
+        pts = (inv_trans @ torch.cat([pts, torch.ones_like(pts[:,0:1])], dim=-1)[...,None])[:,:3,0]
+        pts = pts / scale
+    else:
+        # rotation and translation
+        Rc = torch.stack([x, y, z], dim=1)
+        tc = center.reshape(3, 1)
+        R, t = Rc.T, -Rc.T @ tc
+        poses_homo = torch.cat([poses, torch.as_tensor([[[0.,0.,0.,1.]]]).expand(poses.shape[0], -1, -1)], dim=1)
+        inv_trans = torch.cat([torch.cat([R, t], dim=1), torch.as_tensor([[0.,0.,0.,1.]])], dim=0)
+        poses_norm = (inv_trans @ poses_homo)[:,:3] # (N_images, 4, 4)
+
+        # scaling
+        scale = poses_norm[...,3].norm(p=2, dim=-1).min()
+        poses_norm[...,3] /= scale
+
+        # apply the transformation to the point cloud
+        pts = (inv_trans @ torch.cat([pts, torch.ones_like(pts[:,0:1])], dim=-1)[...,None])[:,:3,0]
+        pts = pts / scale
+
+    return poses_norm, pts
+
+def create_spheric_poses(cameras, n_steps=120):
+    center = torch.as_tensor([0.,0.,0.], dtype=cameras.dtype, device=cameras.device)
+    mean_d = (cameras - center[None,:]).norm(p=2, dim=-1).mean()
+    mean_h = cameras[:,2].mean()
+    r = (mean_d**2 - mean_h**2).sqrt()
+    up = torch.as_tensor([0., 0., 1.], dtype=center.dtype, device=center.device)
+
+    all_c2w = []
+    for theta in torch.linspace(0, 2 * math.pi, n_steps):
+        cam_pos = torch.stack([r * theta.cos(), r * theta.sin(), mean_h])
+        l = F.normalize(center - cam_pos, p=2, dim=0)
+        s = F.normalize(l.cross(up), p=2, dim=0)
+        u = F.normalize(s.cross(l), p=2, dim=0)
+        c2w = torch.cat([torch.stack([s, u, -l], dim=1), cam_pos[:,None]], axis=1)
+        all_c2w.append(c2w)
+
+    all_c2w = torch.stack(all_c2w, dim=0)
+
+    return all_c2w
+
+class ColmapDatasetBase():
+    # the data only has to be processed once
+    initialized = False
+    properties = {}
+
+    def setup(self, config, split):
+        self.config = config
+        self.split = split
+        self.rank = get_rank()
+
+        if not ColmapDatasetBase.initialized:
+            camdata = read_cameras_binary(os.path.join(self.config.root_dir, 'sparse/0/cameras.bin'))
+
+            H = int(camdata[1].height)
+            W = int(camdata[1].width)
+
+            if 'img_wh' in self.config:
+                w, h = self.config.img_wh
+                assert round(W / w * h) == H
+            elif 'img_downscale' in self.config:
+                w, h = int(W / self.config.img_downscale + 0.5), int(H / self.config.img_downscale + 0.5)
+            else:
+                raise KeyError("Either img_wh or img_downscale should be specified.")
+
+            img_wh = (w, h)
+            factor = w / W
+
+            if camdata[1].model == 'SIMPLE_RADIAL':
+                fx = fy = camdata[1].params[0] * factor
+                cx = camdata[1].params[1] * factor
+                cy = camdata[1].params[2] * factor
+            elif camdata[1].model in ['PINHOLE', 'OPENCV']:
+                fx = camdata[1].params[0] * factor
+                fy = camdata[1].params[1] * factor
+                cx = camdata[1].params[2] * factor
+                cy = camdata[1].params[3] * factor
+            else:
+                raise ValueError(f"Please parse the intrinsics for camera model {camdata[1].model}!")
+
+            directions = get_ray_directions(w, h, fx, fy, cx, cy).to(self.rank)
+
+            imdata = read_images_binary(os.path.join(self.config.root_dir, 'sparse/0/images.bin'))
+
+            mask_dir = os.path.join(self.config.root_dir, 'masks')
+            has_mask = os.path.exists(mask_dir) # TODO: support partial masks
+            apply_mask = has_mask and self.config.apply_mask
+
+            all_c2w, all_images, all_fg_masks = [], [], []
+
+            for i, d in enumerate(imdata.values()):
+                R = d.qvec2rotmat()
+                t = d.tvec.reshape(3, 1)
+                c2w = torch.from_numpy(np.concatenate([R.T, -R.T@t], axis=1)).float()
+                c2w[:,1:3] *= -1. # COLMAP => OpenGL
+                all_c2w.append(c2w)
+                if self.split in ['train', 'val']:
+                    img_path = os.path.join(self.config.root_dir, 'images', d.name)
+                    img = Image.open(img_path)
+                    img = img.resize(img_wh, Image.BICUBIC)
+                    img = TF.to_tensor(img).permute(1, 2, 0)[...,:3]
+                    img = img.to(self.rank) if self.config.load_data_on_gpu else img.cpu()
+                    if has_mask:
+                        mask_paths = [os.path.join(mask_dir, d.name), os.path.join(mask_dir, d.name[3:])]
+                        mask_paths = list(filter(os.path.exists, mask_paths))
+                        assert len(mask_paths) == 1
+                        mask = Image.open(mask_paths[0]).convert('L') # (H, W, 1)
+                        mask = mask.resize(img_wh, Image.BICUBIC)
+                        mask = TF.to_tensor(mask)[0]
+                    else:
+                        mask = torch.ones_like(img[...,0], device=img.device)
+                    all_fg_masks.append(mask) # (h, w)
+                    all_images.append(img)
+
+            all_c2w = torch.stack(all_c2w, dim=0)
+
+            pts3d = read_points3d_binary(os.path.join(self.config.root_dir, 'sparse/0/points3D.bin'))
+            pts3d = torch.from_numpy(np.array([pts3d[k].xyz for k in pts3d])).float()
+            all_c2w, pts3d = normalize_poses(all_c2w, pts3d, up_est_method=self.config.up_est_method, center_est_method=self.config.center_est_method)
+
+            ColmapDatasetBase.properties = {
+                'w': w,
+                'h': h,
+                'img_wh': img_wh,
+                'factor': factor,
+                'has_mask': has_mask,
+                'apply_mask': apply_mask,
+                'directions': directions,
+                'pts3d': pts3d,
+                'all_c2w': all_c2w,
+                'all_images': all_images,
+                'all_fg_masks': all_fg_masks
+            }
+
+            ColmapDatasetBase.initialized = True
+
+        for k, v in ColmapDatasetBase.properties.items():
+            setattr(self, k, v)
+
+        if self.split == 'test':
+            self.all_c2w = create_spheric_poses(self.all_c2w[:,:,3], n_steps=self.config.n_test_traj_steps)
+            self.all_images = torch.zeros((self.config.n_test_traj_steps, self.h, self.w, 3), dtype=torch.float32)
+            self.all_fg_masks = torch.zeros((self.config.n_test_traj_steps, self.h, self.w), dtype=torch.float32)
+        else:
+            self.all_images, self.all_fg_masks = torch.stack(self.all_images, dim=0).float(), torch.stack(self.all_fg_masks, dim=0).float()
+
+        """
+        # for debug use
+        from models.ray_utils import get_rays
+        rays_o, rays_d = get_rays(self.directions.cpu(), self.all_c2w, keepdim=True)
+        pts_out = []
+        pts_out.append('\n'.join([' '.join([str(p) for p in l]) + ' 1.0 0.0 0.0' for l in rays_o[:,0,0].reshape(-1, 3).tolist()]))
+
+        t_vals = torch.linspace(0, 1, 8)
+        z_vals = 0.05 * (1 - t_vals) + 0.5 * t_vals
+
+        ray_pts = (rays_o[:,0,0][..., None, :] + z_vals[..., None] * rays_d[:,0,0][..., None, :])
+        pts_out.append('\n'.join([' '.join([str(p) for p in l]) + ' 0.0 1.0 0.0' for l in ray_pts.view(-1, 3).tolist()]))
+
+        ray_pts = (rays_o[:,0,0][..., None, :] + z_vals[..., None] * rays_d[:,self.h-1,0][..., None, :])
+        pts_out.append('\n'.join([' '.join([str(p) for p in l]) + ' 0.0 0.0 1.0' for l in ray_pts.view(-1, 3).tolist()]))

+        ray_pts = (rays_o[:,0,0][..., None, :] + z_vals[..., None] * rays_d[:,0,self.w-1][..., None, :])
+        pts_out.append('\n'.join([' '.join([str(p) for p in l]) + ' 0.0 1.0 1.0' for l in ray_pts.view(-1, 3).tolist()]))
+
+        ray_pts = (rays_o[:,0,0][..., None, :] + z_vals[..., None] * rays_d[:,self.h-1,self.w-1][..., None, :])
+        pts_out.append('\n'.join([' '.join([str(p) for p in l]) + ' 1.0 1.0 1.0' for l in ray_pts.view(-1, 3).tolist()]))
+
+        open('cameras.txt', 'w').write('\n'.join(pts_out))
+        open('scene.txt', 'w').write('\n'.join([' '.join([str(p) for p in l]) + ' 0.0 0.0 0.0' for l in self.pts3d.view(-1, 3).tolist()]))
+
+        exit(1)
+        """
+
+        self.all_c2w = self.all_c2w.float().to(self.rank)
+        if self.config.load_data_on_gpu:
+            self.all_images = self.all_images.to(self.rank)
+            self.all_fg_masks = self.all_fg_masks.to(self.rank)
+
+
+class ColmapDataset(Dataset, ColmapDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __len__(self):
+        return len(self.all_images)
+
+    def __getitem__(self, index):
+        return {
+            'index': index
+        }
+
+
+class ColmapIterableDataset(IterableDataset, ColmapDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __iter__(self):
+        while True:
+            yield {}
+
+
+@datasets.register('colmap')
+class ColmapDataModule(pl.LightningDataModule):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+    def setup(self, stage=None):
+        if stage in [None, 'fit']:
+            self.train_dataset = ColmapIterableDataset(self.config, 'train')
+        if stage in [None, 'fit', 'validate']:
+            self.val_dataset = ColmapDataset(self.config, self.config.get('val_split', 'train'))
+        if stage in [None, 'test']:
+            self.test_dataset = ColmapDataset(self.config, self.config.get('test_split', 'test'))
+        if stage in [None, 'predict']:
+            self.predict_dataset = ColmapDataset(self.config, 'train')
+
+    def prepare_data(self):
+        pass
+
+    def general_loader(self, dataset, batch_size):
+        sampler = None
+        return DataLoader(
+            dataset,
+            num_workers=os.cpu_count(),
+            batch_size=batch_size,
+            pin_memory=True,
+            sampler=sampler
+        )
+
+    def train_dataloader(self):
+        return self.general_loader(self.train_dataset, batch_size=1)
+
+    def val_dataloader(self):
+        return self.general_loader(self.val_dataset, batch_size=1)
+
+    def test_dataloader(self):
+        return self.general_loader(self.test_dataset, batch_size=1)
+
+    def predict_dataloader(self):
+        return self.general_loader(self.predict_dataset, batch_size=1)
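
For reference, `ColmapDatasetBase` above reads a standard COLMAP workspace rooted at `config.root_dir`; the paths it touches are:

    <root_dir>/
        sparse/0/cameras.bin     # intrinsics (SIMPLE_RADIAL, PINHOLE or OPENCV models are handled)
        sparse/0/images.bin      # per-image poses (qvec, tvec) and image names
        sparse/0/points3D.bin    # sparse point cloud, consumed by normalize_poses
        images/<name>            # RGB frames, resized to img_wh
        masks/<name>             # optional foreground masks; their presence sets has_mask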
    	
mesh_recon/datasets/colmap_utils.py
ADDED
@@ -0,0 +1,295 @@
# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
#       its contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Johannes L. Schoenberger (jsch at inf.ethz.ch)

import os
import collections
import numpy as np
import struct


CameraModel = collections.namedtuple(
    "CameraModel", ["model_id", "model_name", "num_params"])
Camera = collections.namedtuple(
    "Camera", ["id", "model", "width", "height", "params"])
BaseImage = collections.namedtuple(
    "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
Point3D = collections.namedtuple(
    "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])

class Image(BaseImage):
    def qvec2rotmat(self):
        return qvec2rotmat(self.qvec)


CAMERA_MODELS = {
    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
    CameraModel(model_id=7, model_name="FOV", num_params=5),
    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
}
CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) \
                         for camera_model in CAMERA_MODELS])


def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
    """Read and unpack the next bytes from a binary file.
    :param fid:
    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
    :param endian_character: Any of {@, =, <, >, !}
    :return: Tuple of read and unpacked values.
    """
    data = fid.read(num_bytes)
    return struct.unpack(endian_character + format_char_sequence, data)


def read_cameras_text(path):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasText(const std::string& path)
        void Reconstruction::ReadCamerasText(const std::string& path)
    """
    cameras = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                camera_id = int(elems[0])
                model = elems[1]
                width = int(elems[2])
                height = int(elems[3])
                params = np.array(tuple(map(float, elems[4:])))
                cameras[camera_id] = Camera(id=camera_id, model=model,
                                            width=width, height=height,
                                            params=params)
    return cameras


def read_cameras_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for camera_line_index in range(num_cameras):
            camera_properties = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_id = camera_properties[0]
            model_id = camera_properties[1]
            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
            width = camera_properties[2]
            height = camera_properties[3]
            num_params = CAMERA_MODEL_IDS[model_id].num_params
            params = read_next_bytes(fid, num_bytes=8*num_params,
                                     format_char_sequence="d"*num_params)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(params))
        assert len(cameras) == num_cameras
    return cameras


def read_images_text(path):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesText(const std::string& path)
        void Reconstruction::WriteImagesText(const std::string& path)
    """
    images = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                camera_id = int(elems[8])
                image_name = elems[9]
                elems = fid.readline().split()
                xys = np.column_stack([tuple(map(float, elems[0::3])),
                                       tuple(map(float, elems[1::3]))])
                point3D_ids = np.array(tuple(map(int, elems[2::3])))
                images[image_id] = Image(
                    id=image_id, qvec=qvec, tvec=tvec,
                    camera_id=camera_id, name=image_name,
                    xys=xys, point3D_ids=point3D_ids)
    return images


def read_images_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    images = {}
    with open(path_to_model_file, "rb") as fid:
        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
        for image_index in range(num_reg_images):
            binary_image_properties = read_next_bytes(
                fid, num_bytes=64, format_char_sequence="idddddddi")
            image_id = binary_image_properties[0]
            qvec = np.array(binary_image_properties[1:5])
            tvec = np.array(binary_image_properties[5:8])
            camera_id = binary_image_properties[8]
            image_name = ""
            current_char = read_next_bytes(fid, 1, "c")[0]
            while current_char != b"\x00":   # look for the ASCII 0 entry
                image_name += current_char.decode("utf-8")
                current_char = read_next_bytes(fid, 1, "c")[0]
            num_points2D = read_next_bytes(fid, num_bytes=8,
                                           format_char_sequence="Q")[0]
            x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,
                                       format_char_sequence="ddq"*num_points2D)
            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
                                   tuple(map(float, x_y_id_s[1::3]))])
            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
            images[image_id] = Image(
                id=image_id, qvec=qvec, tvec=tvec,
                camera_id=camera_id, name=image_name,
                xys=xys, point3D_ids=point3D_ids)
    return images


def read_points3D_text(path):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)
    """
    points3D = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                point3D_id = int(elems[0])
                xyz = np.array(tuple(map(float, elems[1:4])))
                rgb = np.array(tuple(map(int, elems[4:7])))
                error = float(elems[7])
                image_ids = np.array(tuple(map(int, elems[8::2])))
                point2D_idxs = np.array(tuple(map(int, elems[9::2])))
                points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb,
                                               error=error, image_ids=image_ids,
                                               point2D_idxs=point2D_idxs)
    return points3D


def read_points3d_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)
    """
    points3D = {}
    with open(path_to_model_file, "rb") as fid:
        num_points = read_next_bytes(fid, 8, "Q")[0]
        for point_line_index in range(num_points):
            binary_point_line_properties = read_next_bytes(
                fid, num_bytes=43, format_char_sequence="QdddBBBd")
            point3D_id = binary_point_line_properties[0]
            xyz = np.array(binary_point_line_properties[1:4])
            rgb = np.array(binary_point_line_properties[4:7])
            error = np.array(binary_point_line_properties[7])
            track_length = read_next_bytes(
                fid, num_bytes=8, format_char_sequence="Q")[0]
            track_elems = read_next_bytes(
                fid, num_bytes=8*track_length,
                format_char_sequence="ii"*track_length)
            image_ids = np.array(tuple(map(int, track_elems[0::2])))
            point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
            points3D[point3D_id] = Point3D(
                id=point3D_id, xyz=xyz, rgb=rgb,
                error=error, image_ids=image_ids,
                point2D_idxs=point2D_idxs)
    return points3D


def read_model(path, ext):
    if ext == ".txt":
        cameras = read_cameras_text(os.path.join(path, "cameras" + ext))
        images = read_images_text(os.path.join(path, "images" + ext))
        points3D = read_points3D_text(os.path.join(path, "points3D") + ext)
    else:
        cameras = read_cameras_binary(os.path.join(path, "cameras" + ext))
        images = read_images_binary(os.path.join(path, "images" + ext))
        points3D = read_points3d_binary(os.path.join(path, "points3D") + ext)
    return cameras, images, points3D


def qvec2rotmat(qvec):
    return np.array([
        [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
         2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
         2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],
        [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
         1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
         2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],
        [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
         2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
         1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]])


def rotmat2qvec(R):
    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
    K = np.array([
        [Rxx - Ryy - Rzz, 0, 0, 0],
        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
    eigvals, eigvecs = np.linalg.eigh(K)
    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
    if qvec[0] < 0:
        qvec *= -1
    return qvec
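Usage note (not part of the committed file): colmap_utils.py is the standard COLMAP model reader, so the datasets above can turn a sparse reconstruction into camera-to-world poses. A minimal sketch follows; the "sparse/0" path and binary extension are assumptions about where your COLMAP output lives, not something this commit specifies.

import numpy as np
from colmap_utils import read_model, qvec2rotmat

# Load cameras, registered images, and 3D points from a COLMAP sparse model.
cameras, images, points3D = read_model("sparse/0", ext=".bin")

for image_id, image in images.items():
    # COLMAP stores a world-to-camera rotation (qvec) and translation (tvec);
    # invert them to obtain the camera-to-world pose used by the datasets above.
    R = qvec2rotmat(image.qvec)        # (3, 3) world-to-camera rotation
    t = image.tvec.reshape(3, 1)       # (3, 1) world-to-camera translation
    c2w = np.eye(4)
    c2w[:3, :3] = R.T
    c2w[:3, 3] = (-R.T @ t).squeeze()  # camera center in world coordinates
    print(image.name, c2w[:3, 3])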
    	
        mesh_recon/datasets/dtu.py
    ADDED
    
         @@ -0,0 +1,201 @@ 
import os
import json
import math
import numpy as np
from PIL import Image
import cv2

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, IterableDataset
import torchvision.transforms.functional as TF

import pytorch_lightning as pl

import datasets
from models.ray_utils import get_ray_directions
from utils.misc import get_rank


def load_K_Rt_from_P(P=None):
    out = cv2.decomposeProjectionMatrix(P)
    K = out[0]
    R = out[1]
    t = out[2]

    K = K / K[2, 2]
    intrinsics = np.eye(4)
    intrinsics[:3, :3] = K

    pose = np.eye(4, dtype=np.float32)
    pose[:3, :3] = R.transpose()
    pose[:3, 3] = (t[:3] / t[3])[:, 0]

    return intrinsics, pose

def create_spheric_poses(cameras, n_steps=120):
    center = torch.as_tensor([0.,0.,0.], dtype=cameras.dtype, device=cameras.device)
    cam_center = F.normalize(cameras.mean(0), p=2, dim=-1) * cameras.mean(0).norm(2)
    eigvecs = torch.linalg.eig(cameras.T @ cameras).eigenvectors
    rot_axis = F.normalize(eigvecs[:,1].real.float(), p=2, dim=-1)
    up = rot_axis
    rot_dir = torch.cross(rot_axis, cam_center)
    max_angle = (F.normalize(cameras, p=2, dim=-1) * F.normalize(cam_center, p=2, dim=-1)).sum(-1).acos().max()

    all_c2w = []
    for theta in torch.linspace(-max_angle, max_angle, n_steps):
        cam_pos = cam_center * math.cos(theta) + rot_dir * math.sin(theta)
        l = F.normalize(center - cam_pos, p=2, dim=0)
        s = F.normalize(l.cross(up), p=2, dim=0)
        u = F.normalize(s.cross(l), p=2, dim=0)
        c2w = torch.cat([torch.stack([s, u, -l], dim=1), cam_pos[:,None]], axis=1)
        all_c2w.append(c2w)

    all_c2w = torch.stack(all_c2w, dim=0)

    return all_c2w

class DTUDatasetBase():
    def setup(self, config, split):
        self.config = config
        self.split = split
        self.rank = get_rank()

        cams = np.load(os.path.join(self.config.root_dir, self.config.cameras_file))

        img_sample = cv2.imread(os.path.join(self.config.root_dir, 'image', '000000.png'))
        H, W = img_sample.shape[0], img_sample.shape[1]

        if 'img_wh' in self.config:
            w, h = self.config.img_wh
            assert round(W / w * h) == H
        elif 'img_downscale' in self.config:
            w, h = int(W / self.config.img_downscale + 0.5), int(H / self.config.img_downscale + 0.5)
        else:
            raise KeyError("Either img_wh or img_downscale should be specified.")

        self.w, self.h = w, h
        self.img_wh = (w, h)
        self.factor = w / W

        mask_dir = os.path.join(self.config.root_dir, 'mask')
        self.has_mask = True
        self.apply_mask = self.config.apply_mask

        self.directions = []
        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []

        n_images = max([int(k.split('_')[-1]) for k in cams.keys()]) + 1

        for i in range(n_images):
            world_mat, scale_mat = cams[f'world_mat_{i}'], cams[f'scale_mat_{i}']
            P = (world_mat @ scale_mat)[:3,:4]
            K, c2w = load_K_Rt_from_P(P)
            fx, fy, cx, cy = K[0,0] * self.factor, K[1,1] * self.factor, K[0,2] * self.factor, K[1,2] * self.factor
            directions = get_ray_directions(w, h, fx, fy, cx, cy)
            self.directions.append(directions)

            c2w = torch.from_numpy(c2w).float()

            # blender follows opengl camera coordinates (right up back)
            # NeuS DTU data coordinate system (right down front) is different from blender
            # https://github.com/Totoro97/NeuS/issues/9
            # for c2w, flip the sign of input camera coordinate yz
            c2w_ = c2w.clone()
            c2w_[:3,1:3] *= -1. # flip input sign
            self.all_c2w.append(c2w_[:3,:4])

            if self.split in ['train', 'val']:
                img_path = os.path.join(self.config.root_dir, 'image', f'{i:06d}.png')
                img = Image.open(img_path)
                img = img.resize(self.img_wh, Image.BICUBIC)
                img = TF.to_tensor(img).permute(1, 2, 0)[...,:3]

                mask_path = os.path.join(mask_dir, f'{i:03d}.png')
                mask = Image.open(mask_path).convert('L') # (H, W, 1)
                mask = mask.resize(self.img_wh, Image.BICUBIC)
                mask = TF.to_tensor(mask)[0]

                self.all_fg_masks.append(mask) # (h, w)
                self.all_images.append(img)

        self.all_c2w = torch.stack(self.all_c2w, dim=0)

        if self.split == 'test':
            self.all_c2w = create_spheric_poses(self.all_c2w[:,:,3], n_steps=self.config.n_test_traj_steps)
            self.all_images = torch.zeros((self.config.n_test_traj_steps, self.h, self.w, 3), dtype=torch.float32)
            self.all_fg_masks = torch.zeros((self.config.n_test_traj_steps, self.h, self.w), dtype=torch.float32)
            self.directions = self.directions[0]
        else:
            self.all_images, self.all_fg_masks = torch.stack(self.all_images, dim=0), torch.stack(self.all_fg_masks, dim=0)
            self.directions = torch.stack(self.directions, dim=0)

        self.directions = self.directions.float().to(self.rank)
        self.all_c2w, self.all_images, self.all_fg_masks = \
            self.all_c2w.float().to(self.rank), \
            self.all_images.float().to(self.rank), \
            self.all_fg_masks.float().to(self.rank)


class DTUDataset(Dataset, DTUDatasetBase):
    def __init__(self, config, split):
        self.setup(config, split)

    def __len__(self):
        return len(self.all_images)

    def __getitem__(self, index):
        return {
            'index': index
        }


class DTUIterableDataset(IterableDataset, DTUDatasetBase):
    def __init__(self, config, split):
        self.setup(config, split)

    def __iter__(self):
        while True:
            yield {}


@datasets.register('dtu')
class DTUDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config

    def setup(self, stage=None):
        if stage in [None, 'fit']:
            self.train_dataset = DTUIterableDataset(self.config, 'train')
        if stage in [None, 'fit', 'validate']:
            self.val_dataset = DTUDataset(self.config, self.config.get('val_split', 'train'))
        if stage in [None, 'test']:
            self.test_dataset = DTUDataset(self.config, self.config.get('test_split', 'test'))
        if stage in [None, 'predict']:
            self.predict_dataset = DTUDataset(self.config, 'train')

    def prepare_data(self):
        pass

    def general_loader(self, dataset, batch_size):
        sampler = None
        return DataLoader(
            dataset,
            num_workers=os.cpu_count(),
            batch_size=batch_size,
            pin_memory=True,
            sampler=sampler
        )

    def train_dataloader(self):
        return self.general_loader(self.train_dataset, batch_size=1)

    def val_dataloader(self):
        return self.general_loader(self.val_dataset, batch_size=1)

    def test_dataloader(self):
        return self.general_loader(self.test_dataset, batch_size=1)

    def predict_dataloader(self):
        return self.general_loader(self.predict_dataset, batch_size=1)
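Usage note (not part of the committed file): load_K_Rt_from_P above recovers intrinsics and a camera-to-world pose from a DTU projection matrix P = K [R | t]. A small self-contained check sketches what it returns; the synthetic K, R, t values are made up for illustration, and the import path assumes the helper is importable from within mesh_recon/.

import numpy as np
import cv2
from datasets.dtu import load_K_Rt_from_P  # assumption: run from mesh_recon/

# Build a synthetic projection matrix P = K [R | t] with a known pose.
K = np.array([[800.0, 0.0, 320.0],
              [0.0, 800.0, 240.0],
              [0.0, 0.0, 1.0]])
R = cv2.Rodrigues(np.array([0.1, -0.2, 0.05]))[0]  # world-to-camera rotation
t = np.array([[0.3], [-0.1], [2.0]])               # world-to-camera translation
P = K @ np.hstack([R, t])

intrinsics, pose = load_K_Rt_from_P(P)
# pose is camera-to-world: rotation R.T and camera center -R.T @ t.
assert np.allclose(pose[:3, :3], R.T, atol=1e-5)
assert np.allclose(pose[:3, 3], (-R.T @ t).squeeze(), atol=1e-5)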
    	
        mesh_recon/datasets/fixed_poses/000_back_RT.txt
    ADDED
    
@@ -0,0 +1,3 @@
-1.000000238418579102e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
0.000000000000000000e+00 -1.343588564850506373e-07 1.000000119209289551e+00 1.746665105883948854e-07
0.000000000000000000e+00 1.000000119209289551e+00 -1.343588564850506373e-07 -1.300000071525573730e+00
    	
        mesh_recon/datasets/fixed_poses/000_back_left_RT.txt
ADDED
@@ -0,0 +1,3 @@
+-7.071069478988647461e-01 -7.071068286895751953e-01 0.000000000000000000e+00 -1.192092895507812500e-07
+0.000000000000000000e+00 -7.587616579485256807e-08 1.000000119209289551e+00 9.863901340168013121e-08
+-7.071068286895751953e-01 7.071068286895751953e-01 -7.587616579485256807e-08 -1.838477730751037598e+00
    	
        mesh_recon/datasets/fixed_poses/000_back_right_RT.txt
ADDED
@@ -0,0 +1,3 @@
+-7.071069478988647461e-01 7.071068286895751953e-01 0.000000000000000000e+00 1.192092895507812500e-07
+0.000000000000000000e+00 -7.587616579485256807e-08 1.000000119209289551e+00 9.863901340168013121e-08
+7.071068286895751953e-01 7.071068286895751953e-01 -7.587616579485256807e-08 -1.838477730751037598e+00
    	
        mesh_recon/datasets/fixed_poses/000_front_RT.txt
ADDED
@@ -0,0 +1,3 @@
+1.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
+0.000000000000000000e+00 -1.343588564850506373e-07 1.000000119209289551e+00 -1.746665105883948854e-07
+0.000000000000000000e+00 -1.000000119209289551e+00 -1.343588564850506373e-07 -1.300000071525573730e+00
    	
        mesh_recon/datasets/fixed_poses/000_front_left_RT.txt
ADDED
@@ -0,0 +1,3 @@
+7.071067690849304199e-01 -7.071068286895751953e-01 0.000000000000000000e+00 -1.192092895507812500e-07
+0.000000000000000000e+00 -7.587616579485256807e-08 1.000000119209289551e+00 -9.863901340168013121e-08
+-7.071068286895751953e-01 -7.071068286895751953e-01 -7.587616579485256807e-08 -1.838477730751037598e+00
    	
        mesh_recon/datasets/fixed_poses/000_front_right_RT.txt
ADDED
@@ -0,0 +1,3 @@
+7.071067690849304199e-01 7.071068286895751953e-01 0.000000000000000000e+00 1.192092895507812500e-07
+0.000000000000000000e+00 -7.587616579485256807e-08 1.000000119209289551e+00 -9.863901340168013121e-08
+7.071068286895751953e-01 -7.071068286895751953e-01 -7.587616579485256807e-08 -1.838477730751037598e+00
    	
        mesh_recon/datasets/fixed_poses/000_left_RT.txt
ADDED
@@ -0,0 +1,3 @@
+-2.220446049250313081e-16 -1.000000000000000000e+00 0.000000000000000000e+00 -2.886579758146288598e-16
+0.000000000000000000e+00 -2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00
+-1.000000000000000000e+00 0.000000000000000000e+00 -2.220446049250313081e-16 -1.299999952316284180e+00
    	
        mesh_recon/datasets/fixed_poses/000_right_RT.txt
ADDED
@@ -0,0 +1,3 @@
+-2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00 2.886579758146288598e-16
+0.000000000000000000e+00 -2.220446049250313081e-16 1.000000000000000000e+00 0.000000000000000000e+00
+1.000000000000000000e+00 0.000000000000000000e+00 -2.220446049250313081e-16 -1.299999952316284180e+00
    	
        mesh_recon/datasets/fixed_poses/000_top_RT.txt
ADDED
@@ -0,0 +1,3 @@
+1.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
+0.000000000000000000e+00 1.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
+0.000000000000000000e+00 0.000000000000000000e+00 1.000000000000000000e+00 -1.299999952316284180e+00
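
Each of the nine `fixed_poses/000_*_RT.txt` files above holds a 3x4 world-to-camera extrinsic [R|t] for one canonical view, stored in the OpenGL/Blender convention. `load_a_prediction()` in `ortho.py` below reads them with `np.loadtxt`, converts them to the OpenCV convention via `RT_opengl2opencv`, and inverts them with `inv_RT` to obtain camera-to-world poses. The standalone sketch below mirrors that conversion; the file path is the one added in this commit, and the helper logic is inlined from the functions in the diff.

# Sketch: load one fixed pose and convert it the way ortho.py does.
import numpy as np

RT = np.loadtxt("mesh_recon/datasets/fixed_poses/000_front_RT.txt")   # (3, 4) world2cam, OpenGL/Blender
R_bcam2cv = np.asarray([[1, 0, 0], [0, -1, 0], [0, 0, -1]], np.float32)

# RT_opengl2opencv: flip the camera y/z axes.
RT_cv = np.concatenate([R_bcam2cv @ RT[:3, :3], (R_bcam2cv @ RT[:3, 3])[:, None]], axis=1)

# inv_RT: homogenize, invert, drop the last row -> camera-to-world pose.
RT_h = np.concatenate([RT_cv, np.array([[0.0, 0.0, 0.0, 1.0]])], axis=0)
c2w = np.linalg.inv(RT_h)[:3, :]
print(c2w.shape)   # (3, 4)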
    	
        mesh_recon/datasets/ortho.py
ADDED
@@ -0,0 +1,287 @@
+import os
+import json
+import math
+import numpy as np
+from PIL import Image
+import cv2
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader, IterableDataset
+import torchvision.transforms.functional as TF
+
+import pytorch_lightning as pl
+
+import datasets
+from models.ray_utils import get_ortho_ray_directions_origins, get_ortho_rays, get_ray_directions
+from utils.misc import get_rank
+
+from glob import glob
+import PIL.Image
+
+
+def camNormal2worldNormal(rot_c2w, camNormal):
+    H, W, _ = camNormal.shape
+    normal_img = np.matmul(rot_c2w[None, :, :], camNormal.reshape(-1, 3)[:, :, None]).reshape([H, W, 3])
+
+    return normal_img
+
+def worldNormal2camNormal(rot_w2c, worldNormal):
+    H, W, _ = worldNormal.shape
+    normal_img = np.matmul(rot_w2c[None, :, :], worldNormal.reshape(-1, 3)[:, :, None]).reshape([H, W, 3])
+
+    return normal_img
+
+def trans_normal(normal, RT_w2c, RT_w2c_target):
+
+    normal_world = camNormal2worldNormal(np.linalg.inv(RT_w2c[:3, :3]), normal)
+    normal_target_cam = worldNormal2camNormal(RT_w2c_target[:3, :3], normal_world)
+
+    return normal_target_cam
+
+def img2normal(img):
+    return (img / 255.) * 2 - 1
+
+def normal2img(normal):
+    return np.uint8((normal * 0.5 + 0.5) * 255)
+
+def norm_normalize(normal, dim=-1):
+
+    normal = normal / (np.linalg.norm(normal, axis=dim, keepdims=True) + 1e-6)
+
+    return normal
+
+def RT_opengl2opencv(RT):
+    # Build the coordinate transform matrix from world to computer vision camera
+    # R_world2cv = R_bcam2cv @ R_world2bcam
+    # T_world2cv = R_bcam2cv @ T_world2bcam
+
+    R = RT[:3, :3]
+    t = RT[:3, 3]
+
+    R_bcam2cv = np.asarray([[1, 0, 0], [0, -1, 0], [0, 0, -1]], np.float32)
+
+    R_world2cv = R_bcam2cv @ R
+    t_world2cv = R_bcam2cv @ t
+
+    RT = np.concatenate([R_world2cv, t_world2cv[:, None]], 1)
+
+    return RT
+
+def normal_opengl2opencv(normal):
+    H, W, C = np.shape(normal)
+    # normal_img = np.reshape(normal, (H*W, C))
+    R_bcam2cv = np.array([1, -1, -1], np.float32)
+    normal_cv = normal * R_bcam2cv[None, None, :]
+
+    print(np.shape(normal_cv))
+
+    return normal_cv
+
+def inv_RT(RT):
+    RT_h = np.concatenate([RT, np.array([[0, 0, 0, 1]])], axis=0)
+    RT_inv = np.linalg.inv(RT_h)
+
+    return RT_inv[:3, :]
+
+
+def load_a_prediction(root_dir, test_object, imSize, view_types, load_color=False, cam_pose_dir=None,
+                      normal_system='front', erode_mask=True, camera_type='ortho', cam_params=None):
+
+    all_images = []
+    all_normals = []
+    all_normals_world = []
+    all_masks = []
+    all_color_masks = []
+    all_poses = []
+    all_w2cs = []
+    directions = []
+    ray_origins = []
+
+    RT_front = np.loadtxt(glob(os.path.join(cam_pose_dir, '*_%s_RT.txt' % ('front')))[0])   # world2cam matrix
+    RT_front_cv = RT_opengl2opencv(RT_front)   # convert normal from opengl to opencv
+    for idx, view in enumerate(view_types):
+        print(os.path.join(root_dir, test_object))
+        normal_filepath = os.path.join(root_dir, test_object, 'normals_000_%s.png' % (view))
+        # Load key frame
+        if load_color:  # use bgr
+            image = np.array(PIL.Image.open(normal_filepath.replace("normals", "rgb")).resize(imSize))[:, :, :3]
+
+        normal = np.array(PIL.Image.open(normal_filepath).resize(imSize))
+        mask = normal[:, :, 3]
+        normal = normal[:, :, :3]
+
+        color_mask = np.array(PIL.Image.open(os.path.join(root_dir, test_object, 'masked_colors/rgb_000_%s.png' % (view))).resize(imSize))[:, :, 3]
+        invalid_color_mask = color_mask < 255 * 0.5
+        threshold = np.ones_like(image[:, :, 0]) * 250
+        invalid_white_mask = (image[:, :, 0] > threshold) & (image[:, :, 1] > threshold) & (image[:, :, 2] > threshold)
+        invalid_color_mask_final = invalid_color_mask & invalid_white_mask
+        color_mask = (1 - invalid_color_mask_final) > 0
+
+        # if erode_mask:
+        #     kernel = np.ones((3, 3), np.uint8)
+        #     mask = cv2.erode(mask, kernel, iterations=1)
+
+        RT = np.loadtxt(os.path.join(cam_pose_dir, '000_%s_RT.txt' % (view)))  # world2cam matrix
+
+        normal = img2normal(normal)
+
+        normal[mask == 0] = [0, 0, 0]
+        mask = mask > (0.5 * 255)
+        if load_color:
+            all_images.append(image)
+
+        all_masks.append(mask)
+        all_color_masks.append(color_mask)
+        RT_cv = RT_opengl2opencv(RT)   # convert normal from opengl to opencv
+        all_poses.append(inv_RT(RT_cv))   # cam2world
+        all_w2cs.append(RT_cv)
+
+        # whether to
+        normal_cam_cv = normal_opengl2opencv(normal)
+
+        if normal_system == 'front':
+            print("the loaded normals are defined in the system of front view")
+            normal_world = camNormal2worldNormal(inv_RT(RT_front_cv)[:3, :3], normal_cam_cv)
+        elif normal_system == 'self':
+            print("the loaded normals are in their independent camera systems")
+            normal_world = camNormal2worldNormal(inv_RT(RT_cv)[:3, :3], normal_cam_cv)
+        all_normals.append(normal_cam_cv)
+        all_normals_world.append(normal_world)
+
+        if camera_type == 'ortho':
+            origins, dirs = get_ortho_ray_directions_origins(W=imSize[0], H=imSize[1])
+        elif camera_type == 'pinhole':
+            dirs = get_ray_directions(W=imSize[0], H=imSize[1],
+                                      fx=cam_params[0], fy=cam_params[1], cx=cam_params[2], cy=cam_params[3])
+            origins = dirs  # occupy a position
+        else:
+            raise Exception("not support camera type")
+        ray_origins.append(origins)
+        directions.append(dirs)
+
+        if not load_color:
+            all_images = [normal2img(x) for x in all_normals_world]
+
+    return np.stack(all_images), np.stack(all_masks), np.stack(all_normals), \
+        np.stack(all_normals_world), np.stack(all_poses), np.stack(all_w2cs), np.stack(ray_origins), np.stack(directions), np.stack(all_color_masks)
+
+
+class OrthoDatasetBase():
+    def setup(self, config, split):
+        self.config = config
+        self.split = split
+        self.rank = get_rank()
+
+        self.data_dir = self.config.root_dir
+        self.object_name = self.config.scene
+        self.scene = self.config.scene
+        self.imSize = self.config.imSize
+        self.load_color = True
+        self.img_wh = [self.imSize[0], self.imSize[1]]
+        self.w = self.img_wh[0]
+        self.h = self.img_wh[1]
+        self.camera_type = self.config.camera_type
+        self.camera_params = self.config.camera_params  # [fx, fy, cx, cy]
+
+        self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
+
+        self.view_weights = torch.from_numpy(np.array(self.config.view_weights)).float().to(self.rank).view(-1)
+        self.view_weights = self.view_weights.view(-1, 1, 1).repeat(1, self.h, self.w)
+
+        if self.config.cam_pose_dir is None:
+            self.cam_pose_dir = "./datasets/fixed_poses"
+        else:
+            self.cam_pose_dir = self.config.cam_pose_dir
+
+        self.images_np, self.masks_np, self.normals_cam_np, self.normals_world_np, \
+            self.pose_all_np, self.w2c_all_np, self.origins_np, self.directions_np, self.rgb_masks_np = load_a_prediction(
+                self.data_dir, self.object_name, self.imSize, self.view_types,
+                self.load_color, self.cam_pose_dir, normal_system='front',
+                camera_type=self.camera_type, cam_params=self.camera_params)
+
+        self.has_mask = True
+        self.apply_mask = self.config.apply_mask
+
+        self.all_c2w = torch.from_numpy(self.pose_all_np)
+        self.all_images = torch.from_numpy(self.images_np) / 255.
+        self.all_fg_masks = torch.from_numpy(self.masks_np)
+        self.all_rgb_masks = torch.from_numpy(self.rgb_masks_np)
+        self.all_normals_world = torch.from_numpy(self.normals_world_np)
+        self.origins = torch.from_numpy(self.origins_np)
+        self.directions = torch.from_numpy(self.directions_np)
+
+        self.directions = self.directions.float().to(self.rank)
+        self.origins = self.origins.float().to(self.rank)
+        self.all_rgb_masks = self.all_rgb_masks.float().to(self.rank)
+        self.all_c2w, self.all_images, self.all_fg_masks, self.all_normals_world = \
+            self.all_c2w.float().to(self.rank), \
+            self.all_images.float().to(self.rank), \
+            self.all_fg_masks.float().to(self.rank), \
+            self.all_normals_world.float().to(self.rank)
+
+
+class OrthoDataset(Dataset, OrthoDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __len__(self):
+        return len(self.all_images)
+
+    def __getitem__(self, index):
+        return {
+            'index': index
+        }
+
+
+class OrthoIterableDataset(IterableDataset, OrthoDatasetBase):
+    def __init__(self, config, split):
+        self.setup(config, split)
+
+    def __iter__(self):
+        while True:
+            yield {}
+
+
+@datasets.register('ortho')
+class OrthoDataModule(pl.LightningDataModule):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+    def setup(self, stage=None):
+        if stage in [None, 'fit']:
+            self.train_dataset = OrthoIterableDataset(self.config, 'train')
+        if stage in [None, 'fit', 'validate']:
+            self.val_dataset = OrthoDataset(self.config, self.config.get('val_split', 'train'))
+        if stage in [None, 'test']:
+            self.test_dataset = OrthoDataset(self.config, self.config.get('test_split', 'test'))
+        if stage in [None, 'predict']:
+            self.predict_dataset = OrthoDataset(self.config, 'train')
+
+    def prepare_data(self):
+        pass
+
+    def general_loader(self, dataset, batch_size):
+        sampler = None
+        return DataLoader(
+            dataset,
+            num_workers=os.cpu_count(),
+            batch_size=batch_size,
+            pin_memory=True,
+            sampler=sampler
+        )
+
+    def train_dataloader(self):
+        return self.general_loader(self.train_dataset, batch_size=1)
+
+    def val_dataloader(self):
+        return self.general_loader(self.val_dataset, batch_size=1)
+
+    def test_dataloader(self):
+        return self.general_loader(self.test_dataset, batch_size=1)
+
+    def predict_dataloader(self):
+        return self.general_loader(self.predict_dataset, batch_size=1)
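
For reference, `OrthoDatasetBase.setup()` above reads these config fields: `root_dir`, `scene`, `imSize`, `camera_type`, `camera_params`, `view_weights` (one weight per entry in `view_types`), `cam_pose_dir`, and `apply_mask`; the data module additionally looks up `val_split` and `test_split`. The sketch below shows a matching config; the values are placeholders, and an attribute-style config object (OmegaConf here) is assumed because the code mixes `config.root_dir` with `config.get(...)`.

# Illustrative config for the 'ortho' data module; values are placeholders, not from the diff.
from omegaconf import OmegaConf

ortho_config = OmegaConf.create({
    "root_dir": "outputs/mv_predictions",            # contains <scene>/normals_000_<view>.png
    "scene": "example_object",
    "imSize": [256, 256],
    "camera_type": "ortho",                          # or "pinhole" with camera_params = [fx, fy, cx, cy]
    "camera_params": None,
    "view_weights": [1.0, 0.8, 1.0, 1.0, 1.0, 0.8],  # placeholder weights, one per view
    "cam_pose_dir": None,                            # None falls back to "./datasets/fixed_poses"
    "apply_mask": True,
    "val_split": "train",
    "test_split": "test",
})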
    	
        mesh_recon/datasets/utils.py
ADDED
File without changes
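
The normal-map helpers in `datasets/ortho.py` above (`camNormal2worldNormal`, `worldNormal2camNormal`) rotate per-pixel normals using only a 3x3 rotation, and `v3d.py` below imports several of these helpers. A quick round-trip self-check, assuming the functions are imported from `datasets.ortho` as in the diff and the script is run from the `mesh_recon` directory (the random input is purely illustrative):

# Sketch: world -> camera -> world round trip for an (H, W, 3) normal map.
import numpy as np
from datasets.ortho import camNormal2worldNormal, worldNormal2camNormal

H, W = 4, 4
n_world = np.random.randn(H, W, 3).astype(np.float32)
n_world /= np.linalg.norm(n_world, axis=-1, keepdims=True)

RT_w2c = np.loadtxt("datasets/fixed_poses/000_front_RT.txt")  # (3, 4) world2cam
R_w2c = RT_w2c[:3, :3]

n_cam = worldNormal2camNormal(R_w2c, n_world)
n_back = camNormal2worldNormal(np.linalg.inv(R_w2c), n_cam)
print(np.allclose(n_world, n_back, atol=1e-5))   # expected: True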
    	
        mesh_recon/datasets/v3d.py
ADDED
@@ -0,0 +1,284 @@
+import os
+import json
+import math
+import numpy as np
+from PIL import Image
+
+import torch
+from torch.utils.data import Dataset, DataLoader, IterableDataset
+import torchvision.transforms.functional as TF
+from torchvision.utils import make_grid, save_image
+from einops import rearrange
+from mediapy import read_video
+from pathlib import Path
+from rembg import remove, new_session
+
+import pytorch_lightning as pl
+
+import datasets
+from models.ray_utils import get_ray_directions
+from utils.misc import get_rank
+from datasets.ortho import (
+    inv_RT,
+    camNormal2worldNormal,
+    RT_opengl2opencv,
+    normal_opengl2opencv,
+)
+from utils.dpt import DPT
+
+
+def get_c2w_from_up_and_look_at(
+    up,
+    look_at,
+    pos,
+    opengl=False,
+):
+    up = up / np.linalg.norm(up)
+    z = look_at - pos
+    z = z / np.linalg.norm(z)
+    y = -up
+    x = np.cross(y, z)
+    x /= np.linalg.norm(x)
+    y = np.cross(z, x)
+
+    c2w = np.zeros([4, 4], dtype=np.float32)
+    c2w[:3, 0] = x
+    c2w[:3, 1] = y
+    c2w[:3, 2] = z
+    c2w[:3, 3] = pos
+    c2w[3, 3] = 1.0
+
+    # opencv to opengl
+    if opengl:
+        c2w[..., 1:3] *= -1
+
+    return c2w
+
+
+def get_uniform_poses(num_frames, radius, elevation, opengl=False):
+    T = num_frames
+    azimuths = np.deg2rad(np.linspace(0, 360, T + 1)[:T])
+    elevations = np.full_like(azimuths, np.deg2rad(elevation))
+    cam_dists = np.full_like(azimuths, radius)
+
+    campos = np.stack(
+        [
+            cam_dists * np.cos(elevations) * np.cos(azimuths),
+            cam_dists * np.cos(elevations) * np.sin(azimuths),
+            cam_dists * np.sin(elevations),
+        ],
+        axis=-1,
+    )
+
+    center = np.array([0, 0, 0], dtype=np.float32)
+    up = np.array([0, 0, 1], dtype=np.float32)
+    poses = []
+    for t in range(T):
+        poses.append(get_c2w_from_up_and_look_at(up, center, campos[t], opengl=opengl))
+
+    return np.stack(poses, axis=0)
+
+
+def blender2midas(img):
+    """Blender: rub
+    midas: lub
+    """
+    img[..., 0] = -img[..., 0]
+    img[..., 1] = -img[..., 1]
+    img[..., -1] = -img[..., -1]
+    return img
+
+
+def midas2blender(img):
+    """Blender: rub
+    midas: lub
+    """
+    img[..., 0] = -img[..., 0]
+    img[..., 1] = -img[..., 1]
+    img[..., -1] = -img[..., -1]
+    return img
+
+
+class BlenderDatasetBase:
+    def setup(self, config, split):
+        self.config = config
+        self.rank = get_rank()
+
+        self.has_mask = True
+        self.apply_mask = True
+
+        dpt = DPT(device=self.rank, mode="normal")
+
+        # with open(
+        #     os.path.join(
+        #         self.config.root_dir, self.config.scene, f"transforms_train.json"
+        #     ),
+        #     "r",
+        # ) as f:
+        #     meta = json.load(f)
+
+        # if "w" in meta and "h" in meta:
+        #     W, H = int(meta["w"]), int(meta["h"])
+        # else:
+        #     W, H = 800, 800
+        frames = read_video(Path(self.config.root_dir) / f"{self.config.scene}")
+        rembg_session = new_session()
+        num_frames, H, W = frames.shape[:3]
+
+        if "img_wh" in self.config:
+            w, h = self.config.img_wh
+            assert round(W / w * h) == H
+        elif "img_downscale" in self.config:
+            w, h = W // self.config.img_downscale, H // self.config.img_downscale
+        else:
+            raise KeyError("Either img_wh or img_downscale should be specified.")
+
+        self.w, self.h = w, h
+        self.img_wh = (self.w, self.h)
+
+        # self.near, self.far = self.config.near_plane, self.config.far_plane
+
+        self.focal = 0.5 * w / math.tan(0.5 * np.deg2rad(60))  # scaled focal length
+
+        # ray directions for all pixels, same for all images (same H, W, focal)
+        self.directions = get_ray_directions(
+            self.w, self.h, self.focal, self.focal, self.w // 2, self.h // 2
+        ).to(
+            self.rank
+        )  # (h, w, 3)
+
+        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []
+
+        radius = 2.0
+        elevation = 0.0
+        poses = get_uniform_poses(num_frames, radius, elevation, opengl=True)
+        for i, (c2w, frame) in enumerate(zip(poses, frames)):
+            c2w = torch.from_numpy(np.array(c2w)[:3, :4])
+            self.all_c2w.append(c2w)
+
+            img = Image.fromarray(frame)
+            img = remove(img, session=rembg_session)
+            img = img.resize(self.img_wh, Image.BICUBIC)
+            img = TF.to_tensor(img).permute(1, 2, 0)  # (4, h, w) => (h, w, 4)
+
+            self.all_fg_masks.append(img[..., -1])  # (h, w)
+            self.all_images.append(img[..., :3])
+
+        self.all_c2w, self.all_images, self.all_fg_masks = (
+            torch.stack(self.all_c2w, dim=0).float().to(self.rank),
+            torch.stack(self.all_images, dim=0).float().to(self.rank),
+            torch.stack(self.all_fg_masks, dim=0).float().to(self.rank),
+        )
+
+        self.normals = dpt(self.all_images)
+
+        self.all_masks = self.all_fg_masks.cpu().numpy() > 0.1
+
+        self.normals = self.normals * 2.0 - 1.0
+        self.normals = midas2blender(self.normals).cpu().numpy()
+        # self.normals = self.normals.cpu().numpy()
+        self.normals[..., 0] *= -1
+        self.normals[~self.all_masks] = [0, 0, 0]
+        normals = rearrange(self.normals, "b h w c -> b c h w")
+        normals = normals * 0.5 + 0.5
+        normals = torch.from_numpy(normals)
+        # save_image(make_grid(normals, nrow=4), "tmp/normals.png")
+        # exit(0)
+
+        (
+            self.all_poses,
+            self.all_normals,
            +
                        self.all_normals_world,
         
     | 
| 192 | 
         
            +
                        self.all_w2cs,
         
     | 
| 193 | 
         
            +
                        self.all_color_masks,
         
     | 
| 194 | 
         
            +
                    ) = ([], [], [], [], [])
         
     | 
| 195 | 
         
            +
             
     | 
| 196 | 
         
            +
                    for c2w_opengl, normal in zip(self.all_c2w.cpu().numpy(), self.normals):
         
     | 
| 197 | 
         
            +
                        RT_opengl = inv_RT(c2w_opengl)
         
     | 
| 198 | 
         
            +
                        RT_opencv = RT_opengl2opencv(RT_opengl)
         
     | 
| 199 | 
         
            +
                        c2w_opencv = inv_RT(RT_opencv)
         
     | 
| 200 | 
         
            +
                        self.all_poses.append(c2w_opencv)
         
     | 
| 201 | 
         
            +
                        self.all_w2cs.append(RT_opencv)
         
     | 
| 202 | 
         
            +
                        normal = normal_opengl2opencv(normal)
         
     | 
| 203 | 
         
            +
                        normal_world = camNormal2worldNormal(inv_RT(RT_opencv)[:3, :3], normal)
         
     | 
| 204 | 
         
            +
                        self.all_normals.append(normal)
         
     | 
| 205 | 
         
            +
                        self.all_normals_world.append(normal_world)
         
     | 
| 206 | 
         
            +
             
     | 
| 207 | 
         
            +
                    self.directions = torch.stack([self.directions] * len(self.all_images))
         
     | 
| 208 | 
         
            +
                    self.origins = self.directions
         
     | 
| 209 | 
         
            +
                    self.all_poses = np.stack(self.all_poses)
         
     | 
| 210 | 
         
            +
                    self.all_normals = np.stack(self.all_normals)
         
     | 
| 211 | 
         
            +
                    self.all_normals_world = np.stack(self.all_normals_world)
         
     | 
| 212 | 
         
            +
                    self.all_w2cs = np.stack(self.all_w2cs)
         
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
                    self.all_c2w = torch.from_numpy(self.all_poses).float().to(self.rank)
         
     | 
| 215 | 
         
            +
                    self.all_images = self.all_images.to(self.rank)
         
     | 
| 216 | 
         
            +
                    self.all_fg_masks = self.all_fg_masks.to(self.rank)
         
     | 
| 217 | 
         
            +
                    self.all_rgb_masks = self.all_fg_masks.to(self.rank)
         
     | 
| 218 | 
         
            +
                    self.all_normals_world = (
         
     | 
| 219 | 
         
            +
                        torch.from_numpy(self.all_normals_world).float().to(self.rank)
         
     | 
| 220 | 
         
            +
                    )
         
     | 
| 221 | 
         
            +
             
     | 
| 222 | 
         
            +
             
     | 
| 223 | 
         
            +
            class BlenderDataset(Dataset, BlenderDatasetBase):
         
     | 
| 224 | 
         
            +
                def __init__(self, config, split):
         
     | 
| 225 | 
         
            +
                    self.setup(config, split)
         
     | 
| 226 | 
         
            +
             
     | 
| 227 | 
         
            +
                def __len__(self):
         
     | 
| 228 | 
         
            +
                    return len(self.all_images)
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
            +
                def __getitem__(self, index):
         
     | 
| 231 | 
         
            +
                    return {"index": index}
         
     | 
| 232 | 
         
            +
             
     | 
| 233 | 
         
            +
             
     | 
| 234 | 
         
            +
            class BlenderIterableDataset(IterableDataset, BlenderDatasetBase):
         
     | 
| 235 | 
         
            +
                def __init__(self, config, split):
         
     | 
| 236 | 
         
            +
                    self.setup(config, split)
         
     | 
| 237 | 
         
            +
             
     | 
| 238 | 
         
            +
                def __iter__(self):
         
     | 
| 239 | 
         
            +
                    while True:
         
     | 
| 240 | 
         
            +
                        yield {}
         
     | 
| 241 | 
         
            +
             
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
            @datasets.register("v3d")
         
     | 
| 244 | 
         
            +
            class BlenderDataModule(pl.LightningDataModule):
         
     | 
| 245 | 
         
            +
                def __init__(self, config):
         
     | 
| 246 | 
         
            +
                    super().__init__()
         
     | 
| 247 | 
         
            +
                    self.config = config
         
     | 
| 248 | 
         
            +
             
     | 
| 249 | 
         
            +
                def setup(self, stage=None):
         
     | 
| 250 | 
         
            +
                    if stage in [None, "fit"]:
         
     | 
| 251 | 
         
            +
                        self.train_dataset = BlenderIterableDataset(
         
     | 
| 252 | 
         
            +
                            self.config, self.config.train_split
         
     | 
| 253 | 
         
            +
                        )
         
     | 
| 254 | 
         
            +
                    if stage in [None, "fit", "validate"]:
         
     | 
| 255 | 
         
            +
                        self.val_dataset = BlenderDataset(self.config, self.config.val_split)
         
     | 
| 256 | 
         
            +
                    if stage in [None, "test"]:
         
     | 
| 257 | 
         
            +
                        self.test_dataset = BlenderDataset(self.config, self.config.test_split)
         
     | 
| 258 | 
         
            +
                    if stage in [None, "predict"]:
         
     | 
| 259 | 
         
            +
                        self.predict_dataset = BlenderDataset(self.config, self.config.train_split)
         
     | 
| 260 | 
         
            +
             
     | 
| 261 | 
         
            +
                def prepare_data(self):
         
     | 
| 262 | 
         
            +
                    pass
         
     | 
| 263 | 
         
            +
             
     | 
| 264 | 
         
            +
                def general_loader(self, dataset, batch_size):
         
     | 
| 265 | 
         
            +
                    sampler = None
         
     | 
| 266 | 
         
            +
                    return DataLoader(
         
     | 
| 267 | 
         
            +
                        dataset,
         
     | 
| 268 | 
         
            +
                        num_workers=os.cpu_count(),
         
     | 
| 269 | 
         
            +
                        batch_size=batch_size,
         
     | 
| 270 | 
         
            +
                        pin_memory=True,
         
     | 
| 271 | 
         
            +
                        sampler=sampler,
         
     | 
| 272 | 
         
            +
                    )
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
                def train_dataloader(self):
         
     | 
| 275 | 
         
            +
                    return self.general_loader(self.train_dataset, batch_size=1)
         
     | 
| 276 | 
         
            +
             
     | 
| 277 | 
         
            +
                def val_dataloader(self):
         
     | 
| 278 | 
         
            +
                    return self.general_loader(self.val_dataset, batch_size=1)
         
     | 
| 279 | 
         
            +
             
     | 
| 280 | 
         
            +
                def test_dataloader(self):
         
     | 
| 281 | 
         
            +
                    return self.general_loader(self.test_dataset, batch_size=1)
         
     | 
| 282 | 
         
            +
             
     | 
| 283 | 
         
            +
                def predict_dataloader(self):
         
     | 
| 284 | 
         
            +
                    return self.general_loader(self.predict_dataset, batch_size=1)
         
     | 
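For orientation, the sketch below shows how the "v3d" datamodule defined above might be exercised on its own, outside a Lightning Trainer. It is illustrative only: the config is assumed to be an OmegaConf DictConfig (the setup() above relies on both `"img_wh" in self.config` and attribute access), and every value shown (root_dir, scene, split names) is a placeholder rather than part of this commit. Note that BlenderDatasetBase.setup is eager: it reads the whole video, runs rembg matting and DPT normal estimation, and moves all tensors to the current rank's device before the first batch is served.

# Hypothetical usage sketch (not part of the commit).
from omegaconf import OmegaConf

from datasets.v3d import BlenderDataModule

config = OmegaConf.create(
    {
        "root_dir": "load/videos",   # placeholder path
        "scene": "example.mp4",      # passed to read_video(root_dir / scene)
        "img_wh": [576, 576],        # or set img_downscale instead
        "train_split": "train",
        "val_split": "val",
        "test_split": "test",
    }
)

dm = BlenderDataModule(config)
dm.setup("fit")                             # builds train/val datasets eagerly
batch = next(iter(dm.train_dataloader()))   # the IterableDataset yields empty dicts;
                                            # rays are presumably sampled downstream
                                            # from dm.train_dataset's tensors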
    	
mesh_recon/datasets/videonvs.py
ADDED
@@ -0,0 +1,256 @@
import os
import json
import math
import numpy as np
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset
import torchvision.transforms.functional as TF
from torchvision.utils import make_grid, save_image
from einops import rearrange

import pytorch_lightning as pl

import datasets
from models.ray_utils import get_ray_directions
from utils.misc import get_rank
from datasets.ortho import (
    inv_RT,
    camNormal2worldNormal,
    RT_opengl2opencv,
    normal_opengl2opencv,
)
from utils.dpt import DPT


def blender2midas(img):
    """Blender: rub
    midas: lub
    """
    img[..., 0] = -img[..., 0]
    img[..., 1] = -img[..., 1]
    img[..., -1] = -img[..., -1]
    return img


def midas2blender(img):
    """Blender: rub
    midas: lub
    """
    img[..., 0] = -img[..., 0]
    img[..., 1] = -img[..., 1]
    img[..., -1] = -img[..., -1]
    return img


class BlenderDatasetBase:
    def setup(self, config, split):
        self.config = config
        self.rank = get_rank()

        self.has_mask = True
        self.apply_mask = True

        dpt = DPT(device=self.rank, mode="normal")

        with open(
            os.path.join(
                self.config.root_dir, self.config.scene, f"transforms_train.json"
            ),
            "r",
        ) as f:
            meta = json.load(f)

        if "w" in meta and "h" in meta:
            W, H = int(meta["w"]), int(meta["h"])
        else:
            W, H = 800, 800

        if "img_wh" in self.config:
            w, h = self.config.img_wh
            assert round(W / w * h) == H
        elif "img_downscale" in self.config:
            w, h = W // self.config.img_downscale, H // self.config.img_downscale
        else:
            raise KeyError("Either img_wh or img_downscale should be specified.")

        self.w, self.h = w, h
        self.img_wh = (self.w, self.h)

        # self.near, self.far = self.config.near_plane, self.config.far_plane

        self.focal = (
            0.5 * w / math.tan(0.5 * meta["camera_angle_x"])
        )  # scaled focal length

        # ray directions for all pixels, same for all images (same H, W, focal)
        self.directions = get_ray_directions(
            self.w, self.h, self.focal, self.focal, self.w // 2, self.h // 2
        ).to(
            self.rank
        )  # (h, w, 3)

        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []

        for i, frame in enumerate(meta["frames"]):
            c2w = torch.from_numpy(np.array(frame["transform_matrix"])[:3, :4])
            self.all_c2w.append(c2w)

            img_path = os.path.join(
                self.config.root_dir,
                self.config.scene,
                f"{frame['file_path']}.png",
            )
            img = Image.open(img_path)
            img = img.resize(self.img_wh, Image.BICUBIC)
            img = TF.to_tensor(img).permute(1, 2, 0)  # (4, h, w) => (h, w, 4)

            self.all_fg_masks.append(img[..., -1])  # (h, w)
            self.all_images.append(img[..., :3])

        self.all_c2w, self.all_images, self.all_fg_masks = (
            torch.stack(self.all_c2w, dim=0).float().to(self.rank),
            torch.stack(self.all_images, dim=0).float().to(self.rank),
            torch.stack(self.all_fg_masks, dim=0).float().to(self.rank),
        )

        self.normals = dpt(self.all_images)

        self.all_masks = self.all_fg_masks.cpu().numpy() > 0.1

        self.normals = self.normals * 2.0 - 1.0
        self.normals = midas2blender(self.normals).cpu().numpy()
        # self.normals = self.normals.cpu().numpy()
        self.normals[..., 0] *= -1
        self.normals[~self.all_masks] = [0, 0, 0]
        normals = rearrange(self.normals, "b h w c -> b c h w")
        normals = normals * 0.5 + 0.5
        normals = torch.from_numpy(normals)
        save_image(make_grid(normals, nrow=4), "tmp/normals.png")
        # exit(0)

        (
            self.all_poses,
            self.all_normals,
            self.all_normals_world,
            self.all_w2cs,
            self.all_color_masks,
        ) = ([], [], [], [], [])

        for c2w_opengl, normal in zip(self.all_c2w.cpu().numpy(), self.normals):
            RT_opengl = inv_RT(c2w_opengl)
            RT_opencv = RT_opengl2opencv(RT_opengl)
            c2w_opencv = inv_RT(RT_opencv)
            self.all_poses.append(c2w_opencv)
            self.all_w2cs.append(RT_opencv)
            normal = normal_opengl2opencv(normal)
            normal_world = camNormal2worldNormal(inv_RT(RT_opencv)[:3, :3], normal)
            self.all_normals.append(normal)
            self.all_normals_world.append(normal_world)

        self.directions = torch.stack([self.directions] * len(self.all_images))
        self.origins = self.directions
        self.all_poses = np.stack(self.all_poses)
        self.all_normals = np.stack(self.all_normals)
        self.all_normals_world = np.stack(self.all_normals_world)
        self.all_w2cs = np.stack(self.all_w2cs)

        self.all_c2w = torch.from_numpy(self.all_poses).float().to(self.rank)
        self.all_images = self.all_images.to(self.rank)
        self.all_fg_masks = self.all_fg_masks.to(self.rank)
        self.all_rgb_masks = self.all_fg_masks.to(self.rank)
        self.all_normals_world = (
            torch.from_numpy(self.all_normals_world).float().to(self.rank)
        )

        # normals = rearrange(self.all_normals_world, "b h w c -> b c h w")
        # normals = normals * 0.5 + 0.5
        # # normals = torch.from_numpy(normals)
        # save_image(make_grid(normals, nrow=4), "tmp/normals_world.png")
        # # exit(0)

        # # normals = (normals + 1) / 2.0
        # # for debug
        # index = [0, 9]
        # self.all_poses = self.all_poses[index]
        # self.all_c2w = self.all_c2w[index]
        # self.all_normals_world = self.all_normals_world[index]
        # self.all_w2cs = self.all_w2cs[index]
        # self.rgb_masks = self.all_rgb_masks[index]
        # self.fg_masks = self.all_fg_masks[index]
        # self.all_images = self.all_images[index]
        # self.directions = self.directions[index]
        # self.origins = self.origins[index]

        # images = rearrange(self.all_images, "b h w c -> b c h w")
        # normals = rearrange(normals, "b h w c -> b c h w")
        # save_image(make_grid(images, nrow=4), "tmp/images.png")
        # save_image(make_grid(normals, nrow=4), "tmp/normals.png")
        # breakpoint()

        # self.normals = self.normals * 2.0 - 1.0


class BlenderDataset(Dataset, BlenderDatasetBase):
    def __init__(self, config, split):
        self.setup(config, split)

    def __len__(self):
        return len(self.all_images)

    def __getitem__(self, index):
        return {"index": index}


class BlenderIterableDataset(IterableDataset, BlenderDatasetBase):
    def __init__(self, config, split):
        self.setup(config, split)

    def __iter__(self):
        while True:
            yield {}


@datasets.register("videonvs")
class BlenderDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config

    def setup(self, stage=None):
        if stage in [None, "fit"]:
            self.train_dataset = BlenderIterableDataset(
                self.config, self.config.train_split
            )
        if stage in [None, "fit", "validate"]:
            self.val_dataset = BlenderDataset(self.config, self.config.val_split)
        if stage in [None, "test"]:
            self.test_dataset = BlenderDataset(self.config, self.config.test_split)
        if stage in [None, "predict"]:
            self.predict_dataset = BlenderDataset(self.config, self.config.train_split)

    def prepare_data(self):
        pass

    def general_loader(self, dataset, batch_size):
        sampler = None
        return DataLoader(
            dataset,
            num_workers=os.cpu_count(),
            batch_size=batch_size,
            pin_memory=True,
            sampler=sampler,
        )

    def train_dataloader(self):
        return self.general_loader(self.train_dataset, batch_size=1)

    def val_dataloader(self):
        return self.general_loader(self.val_dataset, batch_size=1)

    def test_dataloader(self):
        return self.general_loader(self.test_dataset, batch_size=1)

    def predict_dataloader(self):
        return self.general_loader(self.predict_dataset, batch_size=1)
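One detail worth noting: videonvs.py derives its focal length from the camera_angle_x field of transforms_train.json, whereas v3d.py above hard-codes a 60-degree field of view for the synthesized orbit. Both use the same pinhole relation with the principal point at the image centre (w // 2, h // 2); the snippet below just restates that relation with a worked number. The helper name is illustrative, not part of the commit.

# Pinhole relation used by both loaders: focal length in pixels from the
# horizontal field of view (in radians, as camera_angle_x is).
import math

def focal_from_fov_x(width_px: int, fov_x_rad: float) -> float:
    # f = 0.5 * W / tan(0.5 * fov_x)
    return 0.5 * width_px / math.tan(0.5 * fov_x_rad)

print(focal_from_fov_x(576, math.radians(60)))  # ~498.8 px for a 576-px-wide frame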
    	
mesh_recon/datasets/videonvs_co3d.py
ADDED
@@ -0,0 +1,252 @@
import os
import json
import math
import numpy as np
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset
import torchvision.transforms.functional as TF
from torchvision.utils import make_grid, save_image
from einops import rearrange
from rembg import remove, new_session

import pytorch_lightning as pl

import datasets
from models.ray_utils import get_ray_directions
from utils.misc import get_rank
from datasets.ortho import (
    inv_RT,
    camNormal2worldNormal,
    RT_opengl2opencv,
    normal_opengl2opencv,
)
from utils.dpt import DPT


def blender2midas(img):
    """Blender: rub
    midas: lub
    """
    img[..., 0] = -img[..., 0]
    img[..., 1] = -img[..., 1]
    img[..., -1] = -img[..., -1]
    return img


def midas2blender(img):
    """Blender: rub
    midas: lub
    """
    img[..., 0] = -img[..., 0]
    img[..., 1] = -img[..., 1]
    img[..., -1] = -img[..., -1]
    return img


class BlenderDatasetBase:
    def setup(self, config, split):
        self.config = config
        self.rank = get_rank()

        self.has_mask = True
        self.apply_mask = True

        dpt = DPT(device=self.rank, mode="normal")

        self.directions = []
        with open(
            os.path.join(self.config.root_dir, self.config.scene, f"transforms.json"),
            "r",
        ) as f:
            meta = json.load(f)

        if "w" in meta and "h" in meta:
            W, H = int(meta["w"]), int(meta["h"])
        else:
            W, H = 800, 800

        if "img_wh" in self.config:
            w, h = self.config.img_wh
            assert round(W / w * h) == H
        elif "img_downscale" in self.config:
            w, h = W // self.config.img_downscale, H // self.config.img_downscale
        else:
            raise KeyError("Either img_wh or img_downscale should be specified.")

        self.w, self.h = w, h
        self.img_wh = (self.w, self.h)

        # self.near, self.far = self.config.near_plane, self.config.far_plane
        _session = new_session()
        self.all_c2w, self.all_images, self.all_fg_masks = [], [], []

        for i, frame in enumerate(meta["frames"]):
            c2w = torch.from_numpy(np.array(frame["transform_matrix"])[:3, :4])
            self.all_c2w.append(c2w)

            img_path = os.path.join(
                self.config.root_dir,
                self.config.scene,
                f"{frame['file_path']}",
            )
            img = Image.open(img_path)
            img = remove(img, session=_session)
            img = img.resize(self.img_wh, Image.BICUBIC)
            img = TF.to_tensor(img).permute(1, 2, 0)  # (4, h, w) => (h, w, 4)
            fx = frame["fl_x"]
            fy = frame["fl_y"]
            cx = frame["cx"]
            cy = frame["cy"]

            self.all_fg_masks.append(img[..., -1])  # (h, w)
            self.all_images.append(img[..., :3])

            self.directions.append(get_ray_directions(self.w, self.h, fx, fy, cx, cy))

        self.all_c2w, self.all_images, self.all_fg_masks = (
            torch.stack(self.all_c2w, dim=0).float().to(self.rank),
            torch.stack(self.all_images, dim=0).float().to(self.rank),
            torch.stack(self.all_fg_masks, dim=0).float().to(self.rank),
        )

        self.normals = dpt(self.all_images)

        self.all_masks = self.all_fg_masks.cpu().numpy() > 0.1

        self.normals = self.normals * 2.0 - 1.0
        self.normals = midas2blender(self.normals).cpu().numpy()
        # self.normals = self.normals.cpu().numpy()
        self.normals[..., 0] *= -1
        self.normals[~self.all_masks] = [0, 0, 0]
        normals = rearrange(self.normals, "b h w c -> b c h w")
        normals = normals * 0.5 + 0.5
        normals = torch.from_numpy(normals)
        save_image(make_grid(normals, nrow=4), "tmp/normals.png")
        # exit(0)

        (
            self.all_poses,
            self.all_normals,
            self.all_normals_world,
            self.all_w2cs,
            self.all_color_masks,
        ) = ([], [], [], [], [])

        for c2w_opengl, normal in zip(self.all_c2w.cpu().numpy(), self.normals):
            RT_opengl = inv_RT(c2w_opengl)
            RT_opencv = RT_opengl2opencv(RT_opengl)
            c2w_opencv = inv_RT(RT_opencv)
            self.all_poses.append(c2w_opencv)
            self.all_w2cs.append(RT_opencv)
            normal = normal_opengl2opencv(normal)
            normal_world = camNormal2worldNormal(inv_RT(RT_opencv)[:3, :3], normal)
            self.all_normals.append(normal)
            self.all_normals_world.append(normal_world)

        self.directions = torch.stack(self.directions).to(self.rank)
        self.origins = self.directions
        self.all_poses = np.stack(self.all_poses)
        self.all_normals = np.stack(self.all_normals)
        self.all_normals_world = np.stack(self.all_normals_world)
        self.all_w2cs = np.stack(self.all_w2cs)

        self.all_c2w = torch.from_numpy(self.all_poses).float().to(self.rank)
        self.all_images = self.all_images.to(self.rank)
        self.all_fg_masks = self.all_fg_masks.to(self.rank)
        self.all_rgb_masks = self.all_fg_masks.to(self.rank)
        self.all_normals_world = (
            torch.from_numpy(self.all_normals_world).float().to(self.rank)
         
     | 
| 161 | 
         
            +
                    )
         
     | 
| 162 | 
         
            +
             
     | 
| 163 | 
         
            +
                    # normals = rearrange(self.all_normals_world, "b h w c -> b c h w")
         
     | 
| 164 | 
         
            +
                    # normals = normals * 0.5 + 0.5
         
     | 
| 165 | 
         
            +
                    # # normals = torch.from_numpy(normals)
         
     | 
| 166 | 
         
            +
                    # save_image(make_grid(normals, nrow=4), "tmp/normals_world.png")
         
     | 
| 167 | 
         
            +
                    # # exit(0)
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
                    # # normals = (normals + 1) / 2.0
         
     | 
| 170 | 
         
            +
                    # # for debug
         
     | 
| 171 | 
         
            +
                    # index = [0, 9]
         
     | 
| 172 | 
         
            +
                    # self.all_poses = self.all_poses[index]
         
     | 
| 173 | 
         
            +
                    # self.all_c2w = self.all_c2w[index]
         
     | 
| 174 | 
         
            +
                    # self.all_normals_world = self.all_normals_world[index]
         
     | 
| 175 | 
         
            +
                    # self.all_w2cs = self.all_w2cs[index]
         
     | 
| 176 | 
         
            +
                    # self.rgb_masks = self.all_rgb_masks[index]
         
     | 
| 177 | 
         
            +
                    # self.fg_masks = self.all_fg_masks[index]
         
     | 
| 178 | 
         
            +
                    # self.all_images = self.all_images[index]
         
     | 
| 179 | 
         
            +
                    # self.directions = self.directions[index]
         
     | 
| 180 | 
         
            +
                    # self.origins = self.origins[index]
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         
            +
                    # images = rearrange(self.all_images, "b h w c -> b c h w")
         
     | 
| 183 | 
         
            +
                    # normals = rearrange(normals, "b h w c -> b c h w")
         
     | 
| 184 | 
         
            +
                    # save_image(make_grid(images, nrow=4), "tmp/images.png")
         
     | 
| 185 | 
         
            +
                    # save_image(make_grid(normals, nrow=4), "tmp/normals.png")
         
     | 
| 186 | 
         
            +
                    # breakpoint()
         
     | 
| 187 | 
         
            +
             
     | 
| 188 | 
         
            +
                    # self.normals = self.normals * 2.0 - 1.0
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
             
     | 
| 191 | 
         
            +
            class BlenderDataset(Dataset, BlenderDatasetBase):
         
     | 
| 192 | 
         
            +
                def __init__(self, config, split):
         
     | 
| 193 | 
         
            +
                    self.setup(config, split)
         
     | 
| 194 | 
         
            +
             
     | 
| 195 | 
         
            +
                def __len__(self):
         
     | 
| 196 | 
         
            +
                    return len(self.all_images)
         
     | 
| 197 | 
         
            +
             
     | 
| 198 | 
         
            +
                def __getitem__(self, index):
         
     | 
| 199 | 
         
            +
                    return {"index": index}
         
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
             
     | 
| 202 | 
         
            +
            class BlenderIterableDataset(IterableDataset, BlenderDatasetBase):
         
     | 
| 203 | 
         
            +
                def __init__(self, config, split):
         
     | 
| 204 | 
         
            +
                    self.setup(config, split)
         
     | 
| 205 | 
         
            +
             
     | 
| 206 | 
         
            +
                def __iter__(self):
         
     | 
| 207 | 
         
            +
                    while True:
         
     | 
| 208 | 
         
            +
                        yield {}
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
             
     | 
| 211 | 
         
            +
            @datasets.register("videonvs-scene")
         
     | 
| 212 | 
         
            +
            class VideoNVSScene(pl.LightningDataModule):
         
     | 
| 213 | 
         
            +
                def __init__(self, config):
         
     | 
| 214 | 
         
            +
                    super().__init__()
         
     | 
| 215 | 
         
            +
                    self.config = config
         
     | 
| 216 | 
         
            +
             
     | 
| 217 | 
         
            +
                def setup(self, stage=None):
         
     | 
| 218 | 
         
            +
                    if stage in [None, "fit"]:
         
     | 
| 219 | 
         
            +
                        self.train_dataset = BlenderIterableDataset(
         
     | 
| 220 | 
         
            +
                            self.config, self.config.train_split
         
     | 
| 221 | 
         
            +
                        )
         
     | 
| 222 | 
         
            +
                    if stage in [None, "fit", "validate"]:
         
     | 
| 223 | 
         
            +
                        self.val_dataset = BlenderDataset(self.config, self.config.val_split)
         
     | 
| 224 | 
         
            +
                    if stage in [None, "test"]:
         
     | 
| 225 | 
         
            +
                        self.test_dataset = BlenderDataset(self.config, self.config.test_split)
         
     | 
| 226 | 
         
            +
                    if stage in [None, "predict"]:
         
     | 
| 227 | 
         
            +
                        self.predict_dataset = BlenderDataset(self.config, self.config.train_split)
         
     | 
| 228 | 
         
            +
             
     | 
| 229 | 
         
            +
                def prepare_data(self):
         
     | 
| 230 | 
         
            +
                    pass
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
                def general_loader(self, dataset, batch_size):
         
     | 
| 233 | 
         
            +
                    sampler = None
         
     | 
| 234 | 
         
            +
                    return DataLoader(
         
     | 
| 235 | 
         
            +
                        dataset,
         
     | 
| 236 | 
         
            +
                        num_workers=os.cpu_count(),
         
     | 
| 237 | 
         
            +
                        batch_size=batch_size,
         
     | 
| 238 | 
         
            +
                        pin_memory=True,
         
     | 
| 239 | 
         
            +
                        sampler=sampler,
         
     | 
| 240 | 
         
            +
                    )
         
     | 
| 241 | 
         
            +
             
     | 
| 242 | 
         
            +
                def train_dataloader(self):
         
     | 
| 243 | 
         
            +
                    return self.general_loader(self.train_dataset, batch_size=1)
         
     | 
| 244 | 
         
            +
             
     | 
| 245 | 
         
            +
                def val_dataloader(self):
         
     | 
| 246 | 
         
            +
                    return self.general_loader(self.val_dataset, batch_size=1)
         
     | 
| 247 | 
         
            +
             
     | 
| 248 | 
         
            +
                def test_dataloader(self):
         
     | 
| 249 | 
         
            +
                    return self.general_loader(self.test_dataset, batch_size=1)
         
     | 
| 250 | 
         
            +
             
     | 
| 251 | 
         
            +
                def predict_dataloader(self):
         
     | 
| 252 | 
         
            +
                    return self.general_loader(self.predict_dataset, batch_size=1)
         
     | 
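The loading loop above depends on pose and normal conversion helpers (inv_RT, RT_opengl2opencv, normal_opengl2opencv, camNormal2worldNormal) that live elsewhere in the repository and are not part of this file. For orientation only, here is a minimal sketch of what such helpers conventionally compute, assuming the standard OpenGL-to-OpenCV convention in which the camera y and z axes are flipped; the bodies below are illustrative, not the repository's implementation.

import numpy as np


def inv_RT(RT):
    # Invert a rigid camera transform; accepts 3x4 or 4x4, returns 4x4.
    if RT.shape[0] == 3:
        RT = np.vstack([RT, np.array([0.0, 0.0, 0.0, 1.0])])
    return np.linalg.inv(RT)


def RT_opengl2opencv(RT):
    # OpenGL cameras look down -z with +y up, OpenCV cameras look down +z with -y up,
    # so flip the y and z camera axes of a 4x4 world-to-camera matrix.
    flip_yz = np.diag([1.0, -1.0, -1.0, 1.0])
    return flip_yz @ RT


def normal_opengl2opencv(normal):
    # Apply the same axis flip to per-pixel camera-space normals of shape (H, W, 3).
    return normal * np.array([1.0, -1.0, -1.0])


def camNormal2worldNormal(R_c2w, normal):
    # Rotate camera-space normals into world space with a 3x3 camera-to-world rotation.
    return normal @ R_c2w.T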
    	
mesh_recon/launch.py ADDED
@@ -0,0 +1,144 @@
import sys
import argparse
import os
import time
import logging
from datetime import datetime


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True, help="path to config file")
    parser.add_argument("--gpu", default="0", help="GPU(s) to be used")
    parser.add_argument(
        "--resume", default=None, help="path to the weights to be resumed"
    )
    parser.add_argument(
        "--resume_weights_only",
        action="store_true",
        help="specify this argument to restore only the weights (w/o training states), e.g. --resume path/to/resume --resume_weights_only",
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--train", action="store_true")
    group.add_argument("--validate", action="store_true")
    group.add_argument("--test", action="store_true")
    group.add_argument("--predict", action="store_true")
    # group.add_argument('--export', action='store_true') # TODO: a separate export action

    parser.add_argument("--exp_dir", default="./exp")
    parser.add_argument("--runs_dir", default="./runs")
    parser.add_argument(
        "--verbose", action="store_true", help="if true, set logging level to DEBUG"
    )

    args, extras = parser.parse_known_args()

    # set CUDA_VISIBLE_DEVICES then import pytorch-lightning
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    n_gpus = len(args.gpu.split(","))

    import datasets
    import systems
    import pytorch_lightning as pl
    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
    from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
    from utils.callbacks import (
        CodeSnapshotCallback,
        ConfigSnapshotCallback,
        CustomProgressBar,
    )
    from utils.misc import load_config

    # parse YAML config to OmegaConf
    config = load_config(args.config, cli_args=extras)
    config.cmd_args = vars(args)

    config.trial_name = config.get("trial_name") or (
        config.tag + datetime.now().strftime("@%Y%m%d-%H%M%S")
    )
    config.exp_dir = config.get("exp_dir") or os.path.join(args.exp_dir, config.name)
    config.save_dir = config.get("save_dir") or os.path.join(
        config.exp_dir, config.trial_name, "save"
    )
    config.ckpt_dir = config.get("ckpt_dir") or os.path.join(
        config.exp_dir, config.trial_name, "ckpt"
    )
    config.code_dir = config.get("code_dir") or os.path.join(
        config.exp_dir, config.trial_name, "code"
    )
    config.config_dir = config.get("config_dir") or os.path.join(
        config.exp_dir, config.trial_name, "config"
    )

    logger = logging.getLogger("pytorch_lightning")
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    if "seed" not in config:
        config.seed = int(time.time() * 1000) % 1000
    pl.seed_everything(config.seed)

    dm = datasets.make(config.dataset.name, config.dataset)
    system = systems.make(
        config.system.name,
        config,
        load_from_checkpoint=None if not args.resume_weights_only else args.resume,
    )

    callbacks = []
    if args.train:
        callbacks += [
            ModelCheckpoint(dirpath=config.ckpt_dir, **config.checkpoint),
            LearningRateMonitor(logging_interval="step"),
            # CodeSnapshotCallback(
            #     config.code_dir, use_version=False
            # ),
            ConfigSnapshotCallback(config, config.config_dir, use_version=False),
            CustomProgressBar(refresh_rate=1),
        ]

    loggers = []
    if args.train:
        loggers += [
            TensorBoardLogger(
                args.runs_dir, name=config.name, version=config.trial_name
            ),
            CSVLogger(config.exp_dir, name=config.trial_name, version="csv_logs"),
        ]

    if sys.platform == "win32":
        # does not support multi-gpu on windows
        strategy = "dp"
        assert n_gpus == 1
    else:
        strategy = "ddp_find_unused_parameters_false"

    trainer = Trainer(
        devices=n_gpus,
        accelerator="gpu",
        callbacks=callbacks,
        logger=loggers,
        strategy=strategy,
        **config.trainer
    )

    if args.train:
        if args.resume and not args.resume_weights_only:
            # FIXME: different behavior in pytorch-lightning > 1.9 ?
            trainer.fit(system, datamodule=dm, ckpt_path=args.resume)
        else:
            trainer.fit(system, datamodule=dm)
        trainer.test(system, datamodule=dm)
    elif args.validate:
        trainer.validate(system, datamodule=dm, ckpt_path=args.resume)
    elif args.test:
        trainer.test(system, datamodule=dm, ckpt_path=args.resume)
    elif args.predict:
        trainer.predict(system, datamodule=dm, ckpt_path=args.resume)


if __name__ == "__main__":
    main()
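For orientation, launch.py is the command-line entry point of mesh_recon: it requires --config plus exactly one of --train, --validate, --test, or --predict, builds the datamodule and system named in the YAML config, and hands both to a pytorch_lightning Trainer. A typical run would therefore look like `python launch.py --config path/to/config.yaml --gpu 0 --train`, optionally adding `--resume path/to/checkpoint.ckpt` (and `--resume_weights_only` to restore weights without optimizer or trainer state); the two paths here are placeholders, not files referenced by this commit.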
    	
mesh_recon/mesh.py ADDED
@@ -0,0 +1,845 @@
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            import cv2
         
     | 
| 3 | 
         
            +
            import torch
         
     | 
| 4 | 
         
            +
            import trimesh
         
     | 
| 5 | 
         
            +
            import numpy as np
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            from kiui.op import safe_normalize, dot
         
     | 
| 8 | 
         
            +
            from kiui.typing import *
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            class Mesh:
         
     | 
| 11 | 
         
            +
                """
         
     | 
| 12 | 
         
            +
                A torch-native trimesh class, with support for ``ply/obj/glb`` formats.
         
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
                Note:
         
     | 
| 15 | 
         
            +
                    This class only supports one mesh with a single texture image (an albedo texture and a metallic-roughness texture).
         
     | 
| 16 | 
         
            +
                """
         
     | 
| 17 | 
         
            +
                def __init__(
         
     | 
| 18 | 
         
            +
                    self,
         
     | 
| 19 | 
         
            +
                    v: Optional[Tensor] = None,
         
     | 
| 20 | 
         
            +
                    f: Optional[Tensor] = None,
         
     | 
| 21 | 
         
            +
                    vn: Optional[Tensor] = None,
         
     | 
| 22 | 
         
            +
                    fn: Optional[Tensor] = None,
         
     | 
| 23 | 
         
            +
                    vt: Optional[Tensor] = None,
         
     | 
| 24 | 
         
            +
                    ft: Optional[Tensor] = None,
         
     | 
| 25 | 
         
            +
                    vc: Optional[Tensor] = None, # vertex color
         
     | 
| 26 | 
         
            +
                    albedo: Optional[Tensor] = None,
         
     | 
| 27 | 
         
            +
                    metallicRoughness: Optional[Tensor] = None,
         
     | 
| 28 | 
         
            +
                    device: Optional[torch.device] = None,
         
     | 
| 29 | 
         
            +
                ):
         
     | 
| 30 | 
         
            +
                    """Init a mesh directly using all attributes.
         
     | 
| 31 | 
         
            +
             
     | 
| 32 | 
         
            +
                    Args:
         
     | 
| 33 | 
         
            +
                        v (Optional[Tensor]): vertices, float [N, 3]. Defaults to None.
         
     | 
| 34 | 
         
            +
                        f (Optional[Tensor]): faces, int [M, 3]. Defaults to None.
         
     | 
| 35 | 
         
            +
                        vn (Optional[Tensor]): vertex normals, float [N, 3]. Defaults to None.
         
     | 
| 36 | 
         
            +
                        fn (Optional[Tensor]): faces for normals, int [M, 3]. Defaults to None.
         
     | 
| 37 | 
         
            +
                        vt (Optional[Tensor]): vertex uv coordinates, float [N, 2]. Defaults to None.
         
     | 
| 38 | 
         
            +
                        ft (Optional[Tensor]): faces for uvs, int [M, 3]. Defaults to None.
         
     | 
| 39 | 
         
            +
                        vc (Optional[Tensor]): vertex colors, float [N, 3]. Defaults to None.
         
     | 
| 40 | 
         
            +
                        albedo (Optional[Tensor]): albedo texture, float [H, W, 3], RGB format. Defaults to None.
         
     | 
| 41 | 
         
            +
                        metallicRoughness (Optional[Tensor]): metallic-roughness texture, float [H, W, 3], metallic(Blue) = metallicRoughness[..., 2], roughness(Green) = metallicRoughness[..., 1]. Defaults to None.
         
     | 
| 42 | 
         
            +
                        device (Optional[torch.device]): torch device. Defaults to None.
         
     | 
| 43 | 
         
            +
                    """
         
     | 
| 44 | 
         
            +
                    self.device = device
         
     | 
| 45 | 
         
            +
                    self.v = v
         
     | 
| 46 | 
         
            +
                    self.vn = vn
         
     | 
| 47 | 
         
            +
                    self.vt = vt
         
     | 
| 48 | 
         
            +
                    self.f = f
         
     | 
| 49 | 
         
            +
                    self.fn = fn
         
     | 
| 50 | 
         
            +
                    self.ft = ft
         
     | 
| 51 | 
         
            +
                    # will first see if there is vertex color to use
         
     | 
| 52 | 
         
            +
                    self.vc = vc
         
     | 
| 53 | 
         
            +
                    # only support a single albedo image
         
     | 
| 54 | 
         
            +
                    self.albedo = albedo
         
     | 
| 55 | 
         
            +
                    # pbr extension, metallic(Blue) = metallicRoughness[..., 2], roughness(Green) = metallicRoughness[..., 1]
         
     | 
| 56 | 
         
            +
                    # ref: https://registry.khronos.org/glTF/specs/2.0/glTF-2.0.html
         
     | 
| 57 | 
         
            +
                    self.metallicRoughness = metallicRoughness
         
     | 
| 58 | 
         
            +
             
     | 
| 59 | 
         
            +
                    self.ori_center = 0
         
     | 
| 60 | 
         
            +
                    self.ori_scale = 1
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
                @classmethod
         
     | 
| 63 | 
         
            +
                def load(cls, path, resize=True, clean=False, renormal=True, retex=False, bound=0.9, front_dir='+z', **kwargs):
         
     | 
| 64 | 
         
            +
                    """load mesh from path.
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
                    Args:
         
     | 
| 67 | 
         
            +
                        path (str): path to mesh file, supports ply, obj, glb.
         
     | 
| 68 | 
         
            +
                        clean (bool, optional): perform mesh cleaning at load (e.g., merge close vertices). Defaults to False.
         
     | 
| 69 | 
         
            +
                        resize (bool, optional): auto resize the mesh using ``bound`` into [-bound, bound]^3. Defaults to True.
         
     | 
| 70 | 
         
            +
                        renormal (bool, optional): re-calc the vertex normals. Defaults to True.
         
     | 
| 71 | 
         
            +
                        retex (bool, optional): re-calc the uv coordinates, will overwrite the existing uv coordinates. Defaults to False.
         
     | 
| 72 | 
         
            +
                        bound (float, optional): bound to resize. Defaults to 0.9.
         
     | 
| 73 | 
         
            +
                        front_dir (str, optional): front-view direction of the mesh, should be [+-][xyz][ 123]. Defaults to '+z'.
         
     | 
| 74 | 
         
            +
                        device (torch.device, optional): torch device. Defaults to None.
         
     | 
| 75 | 
         
            +
                    
         
     | 
| 76 | 
         
            +
                    Note:
         
     | 
| 77 | 
         
            +
                        a ``device`` keyword argument can be provided to specify the torch device. 
         
     | 
| 78 | 
         
            +
                        If it's not provided, we will try to use ``'cuda'`` as the device if it's available.
         
     | 
| 79 | 
         
            +
             
     | 
| 80 | 
         
            +
                    Returns:
         
     | 
| 81 | 
         
            +
                        Mesh: the loaded Mesh object.
         
     | 
| 82 | 
         
            +
                    """
         
     | 
| 83 | 
         
            +
                    # obj supports face uv
         
     | 
| 84 | 
         
            +
                    if path.endswith(".obj"):
         
     | 
| 85 | 
         
            +
                        mesh = cls.load_obj(path, **kwargs)
         
     | 
| 86 | 
         
            +
                    # trimesh only supports vertex uv, but can load more formats
         
     | 
| 87 | 
         
            +
                    else:
         
     | 
| 88 | 
         
            +
                        mesh = cls.load_trimesh(path, **kwargs)
         
     | 
| 89 | 
         
            +
                    
         
     | 
| 90 | 
         
            +
                    # clean
         
     | 
| 91 | 
         
            +
                    if clean:
         
     | 
| 92 | 
         
            +
                        from kiui.mesh_utils import clean_mesh
         
     | 
| 93 | 
         
            +
                        vertices = mesh.v.detach().cpu().numpy()
         
     | 
| 94 | 
         
            +
                        triangles = mesh.f.detach().cpu().numpy()
         
     | 
| 95 | 
         
            +
                        vertices, triangles = clean_mesh(vertices, triangles, remesh=False)
         
     | 
| 96 | 
         
            +
                        mesh.v = torch.from_numpy(vertices).contiguous().float().to(mesh.device)
         
     | 
| 97 | 
         
            +
                        mesh.f = torch.from_numpy(triangles).contiguous().int().to(mesh.device)
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
        print(f"[Mesh loading] v: {mesh.v.shape}, f: {mesh.f.shape}")
        # auto-normalize
        if resize:
            mesh.auto_size(bound=bound)
        # auto-fix normal
        if renormal or mesh.vn is None:
            mesh.auto_normal()
            print(f"[Mesh loading] vn: {mesh.vn.shape}, fn: {mesh.fn.shape}")
        # auto-fix texcoords
        if retex or (mesh.albedo is not None and mesh.vt is None):
            mesh.auto_uv(cache_path=path)
            print(f"[Mesh loading] vt: {mesh.vt.shape}, ft: {mesh.ft.shape}")

        # rotate front dir to +z
        if front_dir != "+z":
            # axis switch
            if "-z" in front_dir:
                T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, -1]], device=mesh.device, dtype=torch.float32)
            elif "+x" in front_dir:
                T = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
            elif "-x" in front_dir:
                T = torch.tensor([[0, 0, -1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
            elif "+y" in front_dir:
                T = torch.tensor([[1, 0, 0], [0, 0, 1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
            elif "-y" in front_dir:
                T = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
            else:
                T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
            # rotation (how many 90 degrees)
            if '1' in front_dir:
                T @= torch.tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
            elif '2' in front_dir:
                T @= torch.tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
            elif '3' in front_dir:
                T @= torch.tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
            mesh.v @= T
            mesh.vn @= T

        return mesh

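    # Hypothetical usage sketch (the file name and keyword values are illustrative,
    # not taken from this repo): after the earlier part of load() has read the file,
    # the tail above normalizes and re-orients it, e.g.
    #   mesh = Mesh.load("model.obj", bound=0.9, front_dir="+x1")
    # rescales the mesh into [-0.9, 0.9]^3, rotates its +x front direction to +z, and
    # applies one extra 90-degree turn for the trailing digit.
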
    # load from obj file
    @classmethod
    def load_obj(cls, path, albedo_path=None, device=None):
        """load an ``obj`` mesh.

        Args:
            path (str): path to mesh.
            albedo_path (str, optional): path to the albedo texture image; overrides the texture path specified in the mtl file if provided. Defaults to None.
            device (torch.device, optional): torch device. Defaults to None.

        Note:
            We will try to read the `mtl` path from the `obj` file; otherwise we assume it has the same file name as the `obj` but with the `mtl` extension.
            The `usemtl` statement is ignored, and we only use the last material path in the `mtl` file.

        Returns:
            Mesh: the loaded Mesh object.
        """
        assert os.path.splitext(path)[-1] == ".obj"

        mesh = cls()

        # device
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        mesh.device = device

        # load obj
        with open(path, "r") as f:
            lines = f.readlines()

        def parse_f_v(fv):
            # pass in a vertex term of a face, return (v, vt, vn) indices (-1 if not provided)
            # supported forms:
            # f v1 v2 v3
            # f v1/vt1 v2/vt2 v3/vt3
            # f v1/vt1/vn1 v2/vt2/vn2 v3/vt3/vn3
            # f v1//vn1 v2//vn2 v3//vn3
            xs = [int(x) - 1 if x != "" else -1 for x in fv.split("/")]
            xs.extend([-1] * (3 - len(xs)))
            return xs[0], xs[1], xs[2]
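        # For example, parse_f_v("5/2/7") gives (4, 1, 6) and parse_f_v("5//7") gives
        # (4, -1, 6): OBJ indices are 1-based, hence the -1, and missing fields map to -1.
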
        vertices, texcoords, normals = [], [], []
        faces, tfaces, nfaces = [], [], []
        mtl_path = None

        for line in lines:
            split_line = line.split()
            # empty line
            if len(split_line) == 0:
                continue
            prefix = split_line[0].lower()
            # mtllib
            if prefix == "mtllib":
                mtl_path = split_line[1]
            # usemtl
            elif prefix == "usemtl":
                pass  # ignored
            # v/vn/vt
            elif prefix == "v":
                vertices.append([float(v) for v in split_line[1:]])
            elif prefix == "vn":
                normals.append([float(v) for v in split_line[1:]])
            elif prefix == "vt":
                val = [float(v) for v in split_line[1:]]
                texcoords.append([val[0], 1.0 - val[1]])
            elif prefix == "f":
                vs = split_line[1:]
                nv = len(vs)
                v0, t0, n0 = parse_f_v(vs[0])
                for i in range(nv - 2):  # triangulate (assume vertices are ordered)
                    v1, t1, n1 = parse_f_v(vs[i + 1])
                    v2, t2, n2 = parse_f_v(vs[i + 2])
                    faces.append([v0, v1, v2])
                    tfaces.append([t0, t1, t2])
                    nfaces.append([n0, n1, n2])

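        # The loop above fan-triangulates polygons: a face with nv vertices becomes
        # nv - 2 triangles that all share the face's first vertex.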
        mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
        mesh.vt = (
            torch.tensor(texcoords, dtype=torch.float32, device=device)
            if len(texcoords) > 0
            else None
        )
        mesh.vn = (
            torch.tensor(normals, dtype=torch.float32, device=device)
            if len(normals) > 0
            else None
        )

        mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
        mesh.ft = (
            torch.tensor(tfaces, dtype=torch.int32, device=device)
            if len(texcoords) > 0
            else None
        )
        mesh.fn = (
            torch.tensor(nfaces, dtype=torch.int32, device=device)
            if len(normals) > 0
            else None
        )

        # see if there is vertex color
        use_vertex_color = False
        if mesh.v.shape[1] == 6:
            use_vertex_color = True
            mesh.vc = mesh.v[:, 3:]
            mesh.v = mesh.v[:, :3]
            print(f"[load_obj] use vertex color: {mesh.vc.shape}")

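        # Some exporters write per-vertex RGB after XYZ on each "v" line; the 6-column
        # check above splits those extra channels off into mesh.vc.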
        # try to load texture image
        if not use_vertex_color:
            # try to retrieve mtl file
            mtl_path_candidates = []
            if mtl_path is not None:
                mtl_path_candidates.append(mtl_path)
                mtl_path_candidates.append(os.path.join(os.path.dirname(path), mtl_path))
            mtl_path_candidates.append(path.replace(".obj", ".mtl"))

            mtl_path = None
            for candidate in mtl_path_candidates:
                if os.path.exists(candidate):
                    mtl_path = candidate
                    break

            # if albedo_path is not provided, try to retrieve it from mtl
            metallic_path = None
            roughness_path = None
            if mtl_path is not None and albedo_path is None:
                with open(mtl_path, "r") as f:
                    lines = f.readlines()

                for line in lines:
                    split_line = line.split()
                    # empty line
                    if len(split_line) == 0:
                        continue
                    prefix = split_line[0]

                    if "map_Kd" in prefix:
                        # assume relative path!
                        albedo_path = os.path.join(os.path.dirname(path), split_line[1])
                        print(f"[load_obj] use texture from: {albedo_path}")
                    elif "map_Pm" in prefix:
                        metallic_path = os.path.join(os.path.dirname(path), split_line[1])
                    elif "map_Pr" in prefix:
                        roughness_path = os.path.join(os.path.dirname(path), split_line[1])

            # albedo_path still not found, or the path doesn't exist
            if albedo_path is None or not os.path.exists(albedo_path):
                # init an empty texture
                print(f"[load_obj] init empty albedo!")
                # albedo = np.random.rand(1024, 1024, 3).astype(np.float32)
                albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5])  # default color
            else:
                albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED)
                albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB)
                albedo = albedo.astype(np.float32) / 255
                print(f"[load_obj] load texture: {albedo.shape}")

            mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device)

            # try to load metallic and roughness
            if metallic_path is not None and roughness_path is not None:
                print(f"[load_obj] load metallicRoughness from: {metallic_path}, {roughness_path}")
                metallic = cv2.imread(metallic_path, cv2.IMREAD_UNCHANGED)
                metallic = metallic.astype(np.float32) / 255
                roughness = cv2.imread(roughness_path, cv2.IMREAD_UNCHANGED)
                roughness = roughness.astype(np.float32) / 255
                metallicRoughness = np.stack([np.zeros_like(metallic), roughness, metallic], axis=-1)
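                # Channel packing follows the glTF metallicRoughness convention:
                # green holds roughness, blue holds metallic, red is left unused.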
                mesh.metallicRoughness = torch.tensor(metallicRoughness, dtype=torch.float32, device=device).contiguous()

        return mesh

    @classmethod
    def load_trimesh(cls, path, device=None):
        """load a mesh using ``trimesh.load()``.

        Can load various formats like ``glb`` and serves as a fallback.

        Note:
            We will try to merge all meshes if the glb contains more than one,
            but **this may cause the texture to be lost**, since we only support one texture image!

        Args:
            path (str): path to the mesh file.
            device (torch.device, optional): torch device. Defaults to None.

        Returns:
            Mesh: the loaded Mesh object.
        """
        mesh = cls()

        # device
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        mesh.device = device

        # use trimesh to load ply/glb
        _data = trimesh.load(path)
        if isinstance(_data, trimesh.Scene):
            if len(_data.geometry) == 1:
                _mesh = list(_data.geometry.values())[0]
            else:
                print(f"[load_trimesh] concatenating {len(_data.geometry)} meshes.")
                _concat = []
                # loop the scene graph and apply transform to each mesh
                scene_graph = _data.graph.to_flattened()  # dict {name: {transform: 4x4 mat, geometry: str}}
                for k, v in scene_graph.items():
                    name = v['geometry']
                    if name in _data.geometry and isinstance(_data.geometry[name], trimesh.Trimesh):
                        transform = v['transform']
                        _concat.append(_data.geometry[name].apply_transform(transform))
                _mesh = trimesh.util.concatenate(_concat)
        else:
            _mesh = _data

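        # trimesh reports appearance either as per-vertex colors or as a texture via
        # _mesh.visual.kind; both cases are handled below, falling back to a flat gray
        # albedo when neither is available.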
        if _mesh.visual.kind == 'vertex':
            vertex_colors = _mesh.visual.vertex_colors
            vertex_colors = np.array(vertex_colors[..., :3]).astype(np.float32) / 255
            mesh.vc = torch.tensor(vertex_colors, dtype=torch.float32, device=device)
            print(f"[load_trimesh] use vertex color: {mesh.vc.shape}")
        elif _mesh.visual.kind == 'texture':
            _material = _mesh.visual.material
            if isinstance(_material, trimesh.visual.material.PBRMaterial):
                texture = np.array(_material.baseColorTexture).astype(np.float32) / 255
                # load metallicRoughness if present
                if _material.metallicRoughnessTexture is not None:
                    metallicRoughness = np.array(_material.metallicRoughnessTexture).astype(np.float32) / 255
                    mesh.metallicRoughness = torch.tensor(metallicRoughness, dtype=torch.float32, device=device).contiguous()
            elif isinstance(_material, trimesh.visual.material.SimpleMaterial):
                texture = np.array(_material.to_pbr().baseColorTexture).astype(np.float32) / 255
            else:
                raise NotImplementedError(f"material type {type(_material)} not supported!")
            mesh.albedo = torch.tensor(texture[..., :3], dtype=torch.float32, device=device).contiguous()
            print(f"[load_trimesh] load texture: {texture.shape}")
        else:
            texture = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5])
            mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device)
            print(f"[load_trimesh] failed to load texture.")

        vertices = _mesh.vertices

        try:
            texcoords = _mesh.visual.uv
            texcoords[:, 1] = 1 - texcoords[:, 1]
        except Exception as e:
            texcoords = None

        try:
            normals = _mesh.vertex_normals
        except Exception as e:
            normals = None

        # trimesh only supports per-vertex uv, so the uv and normal faces reuse the vertex faces
        faces = tfaces = nfaces = _mesh.faces

        mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
        mesh.vt = (
            torch.tensor(texcoords, dtype=torch.float32, device=device)
            if texcoords is not None
            else None
        )
        mesh.vn = (
            torch.tensor(normals, dtype=torch.float32, device=device)
            if normals is not None
            else None
        )

        mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
        mesh.ft = (
            torch.tensor(tfaces, dtype=torch.int32, device=device)
            if texcoords is not None
            else None
        )
        mesh.fn = (
            torch.tensor(nfaces, dtype=torch.int32, device=device)
            if normals is not None
            else None
        )

        return mesh

    # sample surface (using trimesh)
    def sample_surface(self, count: int):
        """sample points on the surface of the mesh.

        Args:
            count (int): number of points to sample.

        Returns:
            torch.Tensor: the sampled points, float [count, 3].
        """
        _mesh = trimesh.Trimesh(vertices=self.v.detach().cpu().numpy(), faces=self.f.detach().cpu().numpy())
        points, face_idx = trimesh.sample.sample_surface(_mesh, count)
        points = torch.from_numpy(points).float().to(self.device)
        return points

    # aabb
    def aabb(self):
        """get the axis-aligned bounding box of the mesh.

        Returns:
            Tuple[torch.Tensor]: the min xyz and max xyz of the mesh.
        """
        return torch.min(self.v, dim=0).values, torch.max(self.v, dim=0).values

    # unit size
    @torch.no_grad()
    def auto_size(self, bound=0.9):
        """auto resize the mesh.

        Args:
            bound (float, optional): resizing into ``[-bound, bound]^3``. Defaults to 0.9.
        """
        vmin, vmax = self.aabb()
        self.ori_center = (vmax + vmin) / 2
        self.ori_scale = 2 * bound / torch.max(vmax - vmin).item()
        self.v = (self.v - self.ori_center) * self.ori_scale

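    # Worked example (illustrative numbers): a mesh spanning [0, 2] along its longest
    # axis with bound=0.9 gets ori_center=1 on that axis and ori_scale = 2*0.9/2 = 0.9,
    # so its vertices end up inside [-0.9, 0.9]^3.
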
    def auto_normal(self):
        """auto calculate the vertex normals.
        """
        i0, i1, i2 = self.f[:, 0].long(), self.f[:, 1].long(), self.f[:, 2].long()
        v0, v1, v2 = self.v[i0, :], self.v[i1, :], self.v[i2, :]

        face_normals = torch.cross(v1 - v0, v2 - v0)

        # Splat face normals to vertices
        vn = torch.zeros_like(self.v)
        vn.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
        vn.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
        vn.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)

        # Normalize, replace zero (degenerated) normals with some default value
        vn = torch.where(
            dot(vn, vn) > 1e-20,
            vn,
            torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device),
        )
        vn = safe_normalize(vn)

        self.vn = vn
        self.fn = self.f

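    # Note: the face normals above are unnormalized cross products, so their length is
    # proportional to triangle area; the scatter_add calls therefore accumulate
    # area-weighted normals at each vertex before the final normalization.
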
    def auto_uv(self, cache_path=None, vmap=True):
        """auto calculate the uv coordinates.

        Args:
            cache_path (str, optional): path to save/load the uv cache as a npz file. This avoids recomputing the uv every time the same mesh is loaded, which is time-consuming. Defaults to None.
            vmap (bool, optional): remap vertices based on uv coordinates, so each v corresponds to a unique vt (necessary for formats like gltf).
                Usually this will duplicate the vertices on the edge of the uv atlas. Defaults to True.
        """
        # try to load cache
        if cache_path is not None:
            cache_path = os.path.splitext(cache_path)[0] + "_uv.npz"
        if cache_path is not None and os.path.exists(cache_path):
            data = np.load(cache_path)
            vt_np, ft_np, vmapping = data["vt"], data["ft"], data["vmapping"]
        else:
            import xatlas

            v_np = self.v.detach().cpu().numpy()
            f_np = self.f.detach().int().cpu().numpy()
            atlas = xatlas.Atlas()
            atlas.add_mesh(v_np, f_np)
            chart_options = xatlas.ChartOptions()
            # chart_options.max_iterations = 4
            atlas.generate(chart_options=chart_options)
            vmapping, ft_np, vt_np = atlas[0]  # [N], [M, 3], [N, 2]

            # save to cache
            if cache_path is not None:
                np.savez(cache_path, vt=vt_np, ft=ft_np, vmapping=vmapping)

        vt = torch.from_numpy(vt_np.astype(np.float32)).to(self.device)
        ft = torch.from_numpy(ft_np.astype(np.int32)).to(self.device)
        self.vt = vt
        self.ft = ft

        if vmap:
            vmapping = torch.from_numpy(vmapping.astype(np.int64)).long().to(self.device)
            self.align_v_to_vt(vmapping)

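    # xatlas's vmapping gives, for each (possibly duplicated) atlas vertex, the index
    # of the original vertex it was copied from; that is the gather index consumed by
    # align_v_to_vt below.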
    def align_v_to_vt(self, vmapping=None):
        """remap v/f and vn/fn to vt/ft.

        Args:
            vmapping (np.ndarray, optional): the mapping relationship from f to ft. Defaults to None.
        """
        if vmapping is None:
            ft = self.ft.view(-1).long()
            f = self.f.view(-1).long()
            vmapping = torch.zeros(self.vt.shape[0], dtype=torch.long, device=self.device)
            vmapping[ft] = f  # scatter, randomly choose one if index is not unique

        self.v = self.v[vmapping]
        self.f = self.ft

        if self.vn is not None:
            self.vn = self.vn[vmapping]
            self.fn = self.ft

    def to(self, device):
        """move all tensor attributes to device.

        Args:
            device (torch.device): target device.

        Returns:
            Mesh: self.
        """
        self.device = device
        for name in ["v", "f", "vn", "fn", "vt", "ft", "albedo", "vc", "metallicRoughness"]:
            tensor = getattr(self, name)
            if tensor is not None:
                setattr(self, name, tensor.to(device))
        return self

    def write(self, path):
        """write the mesh to a path.

        Args:
            path (str): path to write, supports ply, obj and glb.
        """
        if path.endswith(".ply"):
            self.write_ply(path)
        elif path.endswith(".obj"):
            self.write_obj(path)
        elif path.endswith(".glb") or path.endswith(".gltf"):
            self.write_glb(path)
        else:
            raise NotImplementedError(f"format {path} not supported!")

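    # Hypothetical usage sketch (file names are illustrative): mesh.write("out.glb")
    # keeps geometry plus texture, while mesh.write("out.ply") goes through write_ply
    # below and therefore drops the texture.
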
    def write_ply(self, path):
        """write the mesh in ply format. Only for geometry!

        Args:
            path (str): path to write.
        """

        if self.albedo is not None:
            print(f'[WARN] ply format does not support exporting texture, will ignore!')

        v_np = self.v.detach().cpu().numpy()
        f_np = self.f.detach().cpu().numpy()

        _mesh = trimesh.Trimesh(vertices=v_np, faces=f_np)
        _mesh.export(path)


    def write_glb(self, path):
        """write the mesh in glb/gltf format.
        This will create a scene with a single mesh.

        Args:
            path (str): path to write.
        """

        # assert self.v.shape[0] == self.vn.shape[0] and self.v.shape[0] == self.vt.shape[0]
        if self.vt is not None and self.v.shape[0] != self.vt.shape[0]:
            self.align_v_to_vt()

        import pygltflib

        f_np = self.f.detach().cpu().numpy().astype(np.uint32)
        f_np_blob = f_np.flatten().tobytes()

        v_np = self.v.detach().cpu().numpy().astype(np.float32)
        v_np_blob = v_np.tobytes()

        blob = f_np_blob + v_np_blob
        byteOffset = len(blob)

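        # Layout of the single binary buffer assembled below: [face indices | vertex
        # positions | (optional) uv coordinates | (optional) PNG-encoded albedo], with
        # one bufferView per segment; byteOffset tracks where the next segment starts.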
        # base mesh
        gltf = pygltflib.GLTF2(
            scene=0,
            scenes=[pygltflib.Scene(nodes=[0])],
            nodes=[pygltflib.Node(mesh=0)],
            meshes=[pygltflib.Mesh(primitives=[pygltflib.Primitive(
                # indices to accessors (0 is triangles)
                attributes=pygltflib.Attributes(
                    POSITION=1,
                ),
                indices=0,
            )])],
            buffers=[
                pygltflib.Buffer(byteLength=len(f_np_blob) + len(v_np_blob))
            ],
            # buffer view (based on dtype)
            bufferViews=[
                # triangles; as flatten (element) array
                pygltflib.BufferView(
                    buffer=0,
                    byteLength=len(f_np_blob),
                    target=pygltflib.ELEMENT_ARRAY_BUFFER,  # GL_ELEMENT_ARRAY_BUFFER (34963)
                ),
                # positions; as vec3 array
                pygltflib.BufferView(
                    buffer=0,
                    byteOffset=len(f_np_blob),
                    byteLength=len(v_np_blob),
                    byteStride=12,  # vec3
                    target=pygltflib.ARRAY_BUFFER,  # GL_ARRAY_BUFFER (34962)
                ),
            ],
            accessors=[
                # 0 = triangles
                pygltflib.Accessor(
                    bufferView=0,
                    componentType=pygltflib.UNSIGNED_INT,  # GL_UNSIGNED_INT (5125)
                    count=f_np.size,
                    type=pygltflib.SCALAR,
                    max=[int(f_np.max())],
                    min=[int(f_np.min())],
                ),
                # 1 = positions
                pygltflib.Accessor(
                    bufferView=1,
                    componentType=pygltflib.FLOAT,  # GL_FLOAT (5126)
                    count=len(v_np),
                    type=pygltflib.VEC3,
                    max=v_np.max(axis=0).tolist(),
                    min=v_np.min(axis=0).tolist(),
                ),
            ],
        )

        # append texture info
        if self.vt is not None:

            vt_np = self.vt.detach().cpu().numpy().astype(np.float32)
            vt_np_blob = vt_np.tobytes()

            albedo = self.albedo.detach().cpu().numpy()
            albedo = (albedo * 255).astype(np.uint8)
            albedo = cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)
            albedo_blob = cv2.imencode('.png', albedo)[1].tobytes()

            # update primitive
            gltf.meshes[0].primitives[0].attributes.TEXCOORD_0 = 2
            gltf.meshes[0].primitives[0].material = 0

            # update materials
            gltf.materials.append(pygltflib.Material(
                pbrMetallicRoughness=pygltflib.PbrMetallicRoughness(
                    baseColorTexture=pygltflib.TextureInfo(index=0, texCoord=0),
                    metallicFactor=0.0,
                    roughnessFactor=1.0,
                ),
                alphaMode=pygltflib.OPAQUE,
                alphaCutoff=None,
                doubleSided=True,
            ))

            gltf.textures.append(pygltflib.Texture(sampler=0, source=0))
            gltf.samplers.append(pygltflib.Sampler(magFilter=pygltflib.LINEAR, minFilter=pygltflib.LINEAR_MIPMAP_LINEAR, wrapS=pygltflib.REPEAT, wrapT=pygltflib.REPEAT))
            gltf.images.append(pygltflib.Image(bufferView=3, mimeType="image/png"))

            # update buffers
            gltf.bufferViews.append(
                # index = 2, texcoords; as vec2 array
                pygltflib.BufferView(
                    buffer=0,
                    byteOffset=byteOffset,
                    byteLength=len(vt_np_blob),
                    byteStride=8,  # vec2
                    target=pygltflib.ARRAY_BUFFER,
                )
            )

            gltf.accessors.append(
                # 2 = texcoords
                pygltflib.Accessor(
                    bufferView=2,
                    componentType=pygltflib.FLOAT,
                    count=len(vt_np),
                    type=pygltflib.VEC2,
                    max=vt_np.max(axis=0).tolist(),
                    min=vt_np.min(axis=0).tolist(),
                )
            )

            blob += vt_np_blob
            byteOffset += len(vt_np_blob)

            gltf.bufferViews.append(
                # index = 3, albedo texture; as none target
                pygltflib.BufferView(
                    buffer=0,
         
     | 
| 730 | 
         
            +
                                byteOffset=byteOffset,
         
     | 
| 731 | 
         
            +
                                byteLength=len(albedo_blob),
         
     | 
| 732 | 
         
            +
                            )
         
     | 
| 733 | 
         
            +
                        )
         
     | 
| 734 | 
         
            +
             
     | 
| 735 | 
         
            +
                        blob += albedo_blob
         
     | 
| 736 | 
         
            +
                        byteOffset += len(albedo_blob)
         
     | 
| 737 | 
         
            +
             
     | 
| 738 | 
         
            +
                        gltf.buffers[0].byteLength = byteOffset
         
     | 
| 739 | 
         
            +
             
     | 
| 740 | 
         
            +
                        # append metllic roughness
         
     | 
| 741 | 
         
            +
                        if self.metallicRoughness is not None:
         
     | 
| 742 | 
         
            +
                            metallicRoughness = self.metallicRoughness.detach().cpu().numpy()
         
     | 
| 743 | 
         
            +
                            metallicRoughness = (metallicRoughness * 255).astype(np.uint8)
         
     | 
| 744 | 
         
            +
                            metallicRoughness = cv2.cvtColor(metallicRoughness, cv2.COLOR_RGB2BGR)
         
     | 
| 745 | 
         
            +
                            metallicRoughness_blob = cv2.imencode('.png', metallicRoughness)[1].tobytes()
         
     | 
| 746 | 
         
            +
             
     | 
| 747 | 
         
            +
                            # update texture definition
         
     | 
| 748 | 
         
            +
                            gltf.materials[0].pbrMetallicRoughness.metallicFactor = 1.0
         
     | 
| 749 | 
         
            +
                            gltf.materials[0].pbrMetallicRoughness.roughnessFactor = 1.0
         
     | 
| 750 | 
         
            +
                            gltf.materials[0].pbrMetallicRoughness.metallicRoughnessTexture = pygltflib.TextureInfo(index=1, texCoord=0)
         
     | 
| 751 | 
         
            +
             
     | 
| 752 | 
         
            +
                            gltf.textures.append(pygltflib.Texture(sampler=1, source=1))
         
     | 
| 753 | 
         
            +
                            gltf.samplers.append(pygltflib.Sampler(magFilter=pygltflib.LINEAR, minFilter=pygltflib.LINEAR_MIPMAP_LINEAR, wrapS=pygltflib.REPEAT, wrapT=pygltflib.REPEAT))
         
     | 
| 754 | 
         
            +
                            gltf.images.append(pygltflib.Image(bufferView=4, mimeType="image/png"))
         
     | 
| 755 | 
         
            +
             
     | 
| 756 | 
         
            +
                            # update buffers
         
     | 
| 757 | 
         
            +
                            gltf.bufferViews.append(
         
     | 
| 758 | 
         
            +
                                # index = 4, metallicRoughness texture; as none target
         
     | 
| 759 | 
         
            +
                                pygltflib.BufferView(
         
     | 
| 760 | 
         
            +
                                    buffer=0,
         
     | 
| 761 | 
         
            +
                                    byteOffset=byteOffset,
         
     | 
| 762 | 
         
            +
                                    byteLength=len(metallicRoughness_blob),
         
     | 
| 763 | 
         
            +
                                )
         
     | 
| 764 | 
         
            +
                            )
         
     | 
| 765 | 
         
            +
             
     | 
| 766 | 
         
            +
                            blob += metallicRoughness_blob
         
     | 
| 767 | 
         
            +
                            byteOffset += len(metallicRoughness_blob)
         
     | 
| 768 | 
         
            +
             
     | 
| 769 | 
         
            +
                            gltf.buffers[0].byteLength = byteOffset
         
     | 
| 770 | 
         
            +
             
     | 
| 771 | 
         
            +
                        
         
     | 
| 772 | 
         
            +
                    # set actual data
         
     | 
| 773 | 
         
            +
                    gltf.set_binary_blob(blob)
         
     | 
| 774 | 
         
            +
             
     | 
| 775 | 
         
            +
                    # glb = b"".join(gltf.save_to_bytes())
         
     | 
| 776 | 
         
            +
                    gltf.save(path)
         
     | 
| 777 | 
         
            +
             
     | 
| 778 | 
         
            +
             
     | 
| 779 | 
         
            +
                def write_obj(self, path):
         
     | 
| 780 | 
         
            +
                    """write the mesh in obj format. Will also write the texture and mtl files.
         
     | 
| 781 | 
         
            +
             
     | 
| 782 | 
         
            +
                    Args:
         
     | 
| 783 | 
         
            +
                        path (str): path to write.
         
     | 
| 784 | 
         
            +
                    """
         
     | 
| 785 | 
         
            +
             
     | 
| 786 | 
         
            +
                    mtl_path = path.replace(".obj", ".mtl")
         
     | 
| 787 | 
         
            +
                    albedo_path = path.replace(".obj", "_albedo.png")
         
     | 
| 788 | 
         
            +
                    metallic_path = path.replace(".obj", "_metallic.png")
         
     | 
| 789 | 
         
            +
                    roughness_path = path.replace(".obj", "_roughness.png")
         
     | 
| 790 | 
         
            +
             
     | 
| 791 | 
         
            +
                    v_np = self.v.detach().cpu().numpy()
         
     | 
| 792 | 
         
            +
                    vt_np = self.vt.detach().cpu().numpy() if self.vt is not None else None
         
     | 
| 793 | 
         
            +
                    vn_np = self.vn.detach().cpu().numpy() if self.vn is not None else None
         
     | 
| 794 | 
         
            +
                    f_np = self.f.detach().cpu().numpy()
         
     | 
| 795 | 
         
            +
                    ft_np = self.ft.detach().cpu().numpy() if self.ft is not None else None
         
     | 
| 796 | 
         
            +
                    fn_np = self.fn.detach().cpu().numpy() if self.fn is not None else None
         
     | 
| 797 | 
         
            +
             
     | 
| 798 | 
         
            +
                    with open(path, "w") as fp:
         
     | 
| 799 | 
         
            +
                        fp.write(f"mtllib {os.path.basename(mtl_path)} \n")
         
     | 
| 800 | 
         
            +
             
     | 
| 801 | 
         
            +
                        for v in v_np:
         
     | 
| 802 | 
         
            +
                            fp.write(f"v {v[0]} {v[1]} {v[2]} \n")
         
     | 
| 803 | 
         
            +
             
     | 
| 804 | 
         
            +
                        if vt_np is not None:
         
     | 
| 805 | 
         
            +
                            for v in vt_np:
         
     | 
| 806 | 
         
            +
                                fp.write(f"vt {v[0]} {1 - v[1]} \n")
         
     | 
| 807 | 
         
            +
             
     | 
| 808 | 
         
            +
                        if vn_np is not None:
         
     | 
| 809 | 
         
            +
                            for v in vn_np:
         
     | 
| 810 | 
         
            +
                                fp.write(f"vn {v[0]} {v[1]} {v[2]} \n")
         
     | 
| 811 | 
         
            +
             
     | 
| 812 | 
         
            +
                        fp.write(f"usemtl defaultMat \n")
         
     | 
| 813 | 
         
            +
                        for i in range(len(f_np)):
         
     | 
| 814 | 
         
            +
                            fp.write(
         
     | 
| 815 | 
         
            +
                                f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \
         
     | 
| 816 | 
         
            +
                                         {f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \
         
     | 
| 817 | 
         
            +
                                         {f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n'
         
     | 
| 818 | 
         
            +
                            )
         
     | 
| 819 | 
         
            +
             
     | 
| 820 | 
         
            +
                    with open(mtl_path, "w") as fp:
         
     | 
| 821 | 
         
            +
                        fp.write(f"newmtl defaultMat \n")
         
     | 
| 822 | 
         
            +
                        fp.write(f"Ka 1 1 1 \n")
         
     | 
| 823 | 
         
            +
                        fp.write(f"Kd 1 1 1 \n")
         
     | 
| 824 | 
         
            +
                        fp.write(f"Ks 0 0 0 \n")
         
     | 
| 825 | 
         
            +
                        fp.write(f"Tr 1 \n")
         
     | 
| 826 | 
         
            +
                        fp.write(f"illum 1 \n")
         
     | 
| 827 | 
         
            +
                        fp.write(f"Ns 0 \n")
         
     | 
| 828 | 
         
            +
                        if self.albedo is not None:
         
     | 
| 829 | 
         
            +
                            fp.write(f"map_Kd {os.path.basename(albedo_path)} \n")
         
     | 
| 830 | 
         
            +
                        if self.metallicRoughness is not None:
         
     | 
| 831 | 
         
            +
                            # ref: https://en.wikipedia.org/wiki/Wavefront_.obj_file#Physically-based_Rendering
         
     | 
| 832 | 
         
            +
                            fp.write(f"map_Pm {os.path.basename(metallic_path)} \n")
         
     | 
| 833 | 
         
            +
                            fp.write(f"map_Pr {os.path.basename(roughness_path)} \n")
         
     | 
| 834 | 
         
            +
             
     | 
| 835 | 
         
            +
                    if self.albedo is not None:
         
     | 
| 836 | 
         
            +
                        albedo = self.albedo.detach().cpu().numpy()
         
     | 
| 837 | 
         
            +
                        albedo = (albedo * 255).astype(np.uint8)
         
     | 
| 838 | 
         
            +
                        cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR))
         
     | 
| 839 | 
         
            +
                    
         
     | 
| 840 | 
         
            +
                    if self.metallicRoughness is not None:
         
     | 
| 841 | 
         
            +
                        metallicRoughness = self.metallicRoughness.detach().cpu().numpy()
         
     | 
| 842 | 
         
            +
                        metallicRoughness = (metallicRoughness * 255).astype(np.uint8)
         
     | 
| 843 | 
         
            +
                        cv2.imwrite(metallic_path, metallicRoughness[..., 2])
         
     | 
| 844 | 
         
            +
                        cv2.imwrite(roughness_path, metallicRoughness[..., 1])
         
     | 
| 845 | 
         
            +
             
     | 
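A hypothetical usage sketch of the OBJ exporter above; `mesh` stands for an already-populated Mesh instance (its construction is defined earlier in mesh.py and is not shown in this excerpt), and the output paths are illustrative only:

# Hypothetical usage sketch, not part of the repository.
# `mesh` is an already-populated Mesh instance from mesh.py.
mesh.write_obj("out/model.obj")   # also writes out/model.mtl plus
                                  # out/model_albedo.png, out/model_metallic.png, out/model_roughness.png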
    	
mesh_recon/models/__init__.py
ADDED
@@ -0,0 +1,16 @@
models = {}


def register(name):
    def decorator(cls):
        models[name] = cls
        return cls
    return decorator


def make(name, config):
    model = models[name](config)
    return model


from . import nerf, neus, geometry, texture
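A quick, hypothetical illustration of the registry above; `toy-model` and `ToyModel` are made-up names, not part of the repository. The concrete geometries, textures and renderers in the files below register themselves the same way and are instantiated from their configs via `models.make`.

# Illustrative sketch of the registry pattern in models/__init__.py.
# "toy-model" and ToyModel are made-up names.
import models


@models.register("toy-model")
class ToyModel:
    def __init__(self, config):
        self.config = config


model = models.make("toy-model", {"radius": 1.0})  # looks the class up by name and instantiates it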
    	
mesh_recon/models/base.py
ADDED
@@ -0,0 +1,32 @@
import torch
import torch.nn as nn

from utils.misc import get_rank

class BaseModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.rank = get_rank()
        self.setup()
        if self.config.get('weights', None):
            self.load_state_dict(torch.load(self.config.weights))

    def setup(self):
        raise NotImplementedError

    def update_step(self, epoch, global_step):
        pass

    def train(self, mode=True):
        return super().train(mode=mode)

    def eval(self):
        return super().eval()

    def regularizations(self, out):
        return {}

    @torch.no_grad()
    def export(self, export_config):
        return {}
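The models in this package subclass BaseModel and override setup() rather than __init__, so config storage, rank lookup and optional weight loading stay in one place. A minimal, made-up sketch (ToyDensity is illustrative only, not repository code):

# Illustrative BaseModel subclass; ToyDensity and its layers are made up.
import torch.nn as nn
from models.base import BaseModel


class ToyDensity(BaseModel):
    def setup(self):
        # called by BaseModel.__init__ after self.config and self.rank are set
        self.mlp = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, points):
        return self.mlp(points)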
    	
mesh_recon/models/geometry.py
ADDED
@@ -0,0 +1,238 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch_lightning.utilities.rank_zero import rank_zero_info

import models
from models.base import BaseModel
from models.utils import scale_anything, get_activation, cleanup, chunk_batch
from models.network_utils import get_encoding, get_mlp, get_encoding_with_network
from utils.misc import get_rank
from systems.utils import update_module_step
from nerfacc import ContractionType


def contract_to_unisphere(x, radius, contraction_type):
    if contraction_type == ContractionType.AABB:
        x = scale_anything(x, (-radius, radius), (0, 1))
    elif contraction_type == ContractionType.UN_BOUNDED_SPHERE:
        x = scale_anything(x, (-radius, radius), (0, 1))
        x = x * 2 - 1  # aabb is at [-1, 1]
        mag = x.norm(dim=-1, keepdim=True)
        mask = mag.squeeze(-1) > 1
        x[mask] = (2 - 1 / mag[mask]) * (x[mask] / mag[mask])
        x = x / 4 + 0.5  # [-inf, inf] is at [0, 1]
    else:
        raise NotImplementedError
    return x


class MarchingCubeHelper(nn.Module):
    def __init__(self, resolution, use_torch=True):
        super().__init__()
        self.resolution = resolution
        self.use_torch = use_torch
        self.points_range = (0, 1)
        if self.use_torch:
            import torchmcubes
            self.mc_func = torchmcubes.marching_cubes
        else:
            import mcubes
            self.mc_func = mcubes.marching_cubes
        self.verts = None

    def grid_vertices(self):
        if self.verts is None:
            x, y, z = torch.linspace(*self.points_range, self.resolution), torch.linspace(*self.points_range, self.resolution), torch.linspace(*self.points_range, self.resolution)
            x, y, z = torch.meshgrid(x, y, z, indexing='ij')
            verts = torch.cat([x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)], dim=-1).reshape(-1, 3)
            self.verts = verts
        return self.verts

    def forward(self, level, threshold=0.):
        level = level.float().view(self.resolution, self.resolution, self.resolution)
        if self.use_torch:
            verts, faces = self.mc_func(level.to(get_rank()), threshold)
            verts, faces = verts.cpu(), faces.cpu().long()
        else:
            verts, faces = self.mc_func(-level.numpy(), threshold)  # transform to numpy
            verts, faces = torch.from_numpy(verts.astype(np.float32)), torch.from_numpy(faces.astype(np.int64))  # transform back to pytorch
        verts = verts / (self.resolution - 1.)
        return {
            'v_pos': verts,
            't_pos_idx': faces
        }


class BaseImplicitGeometry(BaseModel):
    def __init__(self, config):
        super().__init__(config)
        if self.config.isosurface is not None:
            assert self.config.isosurface.method in ['mc', 'mc-torch']
            if self.config.isosurface.method == 'mc-torch':
                raise NotImplementedError("Please do not use mc-torch. It currently has some scaling issues I haven't fixed yet.")
            self.helper = MarchingCubeHelper(self.config.isosurface.resolution, use_torch=self.config.isosurface.method=='mc-torch')
        self.radius = self.config.radius
        self.contraction_type = None  # assigned in system

    def forward_level(self, points):
        raise NotImplementedError

    def isosurface_(self, vmin, vmax):
        def batch_func(x):
            x = torch.stack([
                scale_anything(x[...,0], (0, 1), (vmin[0], vmax[0])),
                scale_anything(x[...,1], (0, 1), (vmin[1], vmax[1])),
                scale_anything(x[...,2], (0, 1), (vmin[2], vmax[2])),
            ], dim=-1).to(self.rank)
            rv = self.forward_level(x).cpu()
            cleanup()
            return rv

        level = chunk_batch(batch_func, self.config.isosurface.chunk, True, self.helper.grid_vertices())
        mesh = self.helper(level, threshold=self.config.isosurface.threshold)
        mesh['v_pos'] = torch.stack([
            scale_anything(mesh['v_pos'][...,0], (0, 1), (vmin[0], vmax[0])),
            scale_anything(mesh['v_pos'][...,1], (0, 1), (vmin[1], vmax[1])),
            scale_anything(mesh['v_pos'][...,2], (0, 1), (vmin[2], vmax[2]))
        ], dim=-1)
        return mesh

    @torch.no_grad()
    def isosurface(self):
        if self.config.isosurface is None:
            raise NotImplementedError
        mesh_coarse = self.isosurface_((-self.radius, -self.radius, -self.radius), (self.radius, self.radius, self.radius))
        vmin, vmax = mesh_coarse['v_pos'].amin(dim=0), mesh_coarse['v_pos'].amax(dim=0)
        vmin_ = (vmin - (vmax - vmin) * 0.1).clamp(-self.radius, self.radius)
        vmax_ = (vmax + (vmax - vmin) * 0.1).clamp(-self.radius, self.radius)
        mesh_fine = self.isosurface_(vmin_, vmax_)
        return mesh_fine


@models.register('volume-density')
class VolumeDensity(BaseImplicitGeometry):
    def setup(self):
        self.n_input_dims = self.config.get('n_input_dims', 3)
        self.n_output_dims = self.config.feature_dim
        self.encoding_with_network = get_encoding_with_network(self.n_input_dims, self.n_output_dims, self.config.xyz_encoding_config, self.config.mlp_network_config)

    def forward(self, points):
        points = contract_to_unisphere(points, self.radius, self.contraction_type)
        out = self.encoding_with_network(points.view(-1, self.n_input_dims)).view(*points.shape[:-1], self.n_output_dims).float()
        density, feature = out[...,0], out
        if 'density_activation' in self.config:
            density = get_activation(self.config.density_activation)(density + float(self.config.density_bias))
        if 'feature_activation' in self.config:
            feature = get_activation(self.config.feature_activation)(feature)
        return density, feature

    def forward_level(self, points):
        points = contract_to_unisphere(points, self.radius, self.contraction_type)
        density = self.encoding_with_network(points.reshape(-1, self.n_input_dims)).reshape(*points.shape[:-1], self.n_output_dims)[...,0]
        if 'density_activation' in self.config:
            density = get_activation(self.config.density_activation)(density + float(self.config.density_bias))
        return -density

    def update_step(self, epoch, global_step):
        update_module_step(self.encoding_with_network, epoch, global_step)


@models.register('volume-sdf')
class VolumeSDF(BaseImplicitGeometry):
    def setup(self):
        self.n_output_dims = self.config.feature_dim
        encoding = get_encoding(3, self.config.xyz_encoding_config)
        network = get_mlp(encoding.n_output_dims, self.n_output_dims, self.config.mlp_network_config)
        self.encoding, self.network = encoding, network
        self.grad_type = self.config.grad_type
        self.finite_difference_eps = self.config.get('finite_difference_eps', 1e-3)
        # the actual value used in training
        # will update at certain steps if finite_difference_eps="progressive"
        self._finite_difference_eps = None
        if self.grad_type == 'finite_difference':
            rank_zero_info(f"Using finite difference to compute gradients with eps={self.finite_difference_eps}")

    def forward(self, points, with_grad=True, with_feature=True, with_laplace=False):
        with torch.inference_mode(torch.is_inference_mode_enabled() and not (with_grad and self.grad_type == 'analytic')):
            with torch.set_grad_enabled(self.training or (with_grad and self.grad_type == 'analytic')):
                if with_grad and self.grad_type == 'analytic':
                    if not self.training:
                        points = points.clone()  # points may be in inference mode, get a copy to enable grad
                    points.requires_grad_(True)

                points_ = points  # points in the original scale
                points = contract_to_unisphere(points, self.radius, self.contraction_type)  # points normalized to (0, 1)

                out = self.network(self.encoding(points.view(-1, 3))).view(*points.shape[:-1], self.n_output_dims).float()
                sdf, feature = out[...,0], out
                if 'sdf_activation' in self.config:
                    sdf = get_activation(self.config.sdf_activation)(sdf + float(self.config.sdf_bias))
                if 'feature_activation' in self.config:
                    feature = get_activation(self.config.feature_activation)(feature)
                if with_grad:
                    if self.grad_type == 'analytic':
                        grad = torch.autograd.grad(
                            sdf, points_, grad_outputs=torch.ones_like(sdf),
                            create_graph=True, retain_graph=True, only_inputs=True
                        )[0]
                    elif self.grad_type == 'finite_difference':
                        eps = self._finite_difference_eps
                        offsets = torch.as_tensor(
                            [
                                [eps, 0.0, 0.0],
                                [-eps, 0.0, 0.0],
                                [0.0, eps, 0.0],
                                [0.0, -eps, 0.0],
                                [0.0, 0.0, eps],
                                [0.0, 0.0, -eps],
                            ]
                        ).to(points_)
                        points_d_ = (points_[...,None,:] + offsets).clamp(-self.radius, self.radius)
                        points_d = scale_anything(points_d_, (-self.radius, self.radius), (0, 1))
                        points_d_sdf = self.network(self.encoding(points_d.view(-1, 3)))[...,0].view(*points.shape[:-1], 6).float()
                        grad = 0.5 * (points_d_sdf[..., 0::2] - points_d_sdf[..., 1::2]) / eps

                        if with_laplace:
                            laplace = (points_d_sdf[..., 0::2] + points_d_sdf[..., 1::2] - 2 * sdf[..., None]).sum(-1) / (eps ** 2)

        rv = [sdf]
        if with_grad:
            rv.append(grad)
        if with_feature:
            rv.append(feature)
        if with_laplace:
            assert self.config.grad_type == 'finite_difference', "Laplace computation is only supported with grad_type='finite_difference'"
            rv.append(laplace)
        rv = [v if self.training else v.detach() for v in rv]
        return rv[0] if len(rv) == 1 else rv

    def forward_level(self, points):
        points = contract_to_unisphere(points, self.radius, self.contraction_type)  # points normalized to (0, 1)
        sdf = self.network(self.encoding(points.view(-1, 3))).view(*points.shape[:-1], self.n_output_dims)[...,0]
        if 'sdf_activation' in self.config:
            sdf = get_activation(self.config.sdf_activation)(sdf + float(self.config.sdf_bias))
        return sdf

    def update_step(self, epoch, global_step):
        update_module_step(self.encoding, epoch, global_step)
        update_module_step(self.network, epoch, global_step)
        if self.grad_type == 'finite_difference':
            if isinstance(self.finite_difference_eps, float):
                self._finite_difference_eps = self.finite_difference_eps
            elif self.finite_difference_eps == 'progressive':
                hg_conf = self.config.xyz_encoding_config
                assert hg_conf.otype == "ProgressiveBandHashGrid", "finite_difference_eps='progressive' only works with ProgressiveBandHashGrid"
                current_level = min(
                    hg_conf.start_level + max(global_step - hg_conf.start_step, 0) // hg_conf.update_steps,
                    hg_conf.n_levels
                )
                grid_res = hg_conf.base_resolution * hg_conf.per_level_scale**(current_level - 1)
                grid_size = 2 * self.config.radius / grid_res
                if grid_size != self._finite_difference_eps:
                    rank_zero_info(f"Update finite_difference_eps to {grid_size}")
                self._finite_difference_eps = grid_size
            else:
                raise ValueError(f"Unknown finite_difference_eps={self.finite_difference_eps}")
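When grad_type='finite_difference', VolumeSDF.forward probes the SDF at x ± eps along each axis and forms central differences, grad_i ≈ (f(x + eps·e_i) − f(x − eps·e_i)) / (2·eps), reusing the same six probes for the Laplacian. A standalone NumPy sketch on an analytic sphere SDF (illustration only, not repository code):

# Standalone illustration of the central-difference scheme used in VolumeSDF
# with grad_type='finite_difference'; not part of the repository.
import numpy as np

def sphere_sdf(p, r=0.5):
    return np.linalg.norm(p, axis=-1) - r

eps = 1e-3
x = np.array([0.3, 0.1, -0.2])
offsets = np.array([
    [eps, 0, 0], [-eps, 0, 0],
    [0, eps, 0], [0, -eps, 0],
    [0, 0, eps], [0, 0, -eps],
])
probes = sphere_sdf(x + offsets)                   # six probe values, analogous to points_d_sdf
grad = 0.5 * (probes[0::2] - probes[1::2]) / eps   # central differences along x, y, z
laplace = (probes[0::2] + probes[1::2] - 2 * sphere_sdf(x)).sum() / eps**2

print(grad)     # ~ x / |x|, the unit normal of the sphere (gradient norm ~ 1)
print(laplace)  # ~ 2 / |x| for a sphere SDF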
    	
mesh_recon/models/nerf.py
ADDED
@@ -0,0 +1,161 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import models
from models.base import BaseModel
from models.utils import chunk_batch
from systems.utils import update_module_step
from nerfacc import ContractionType, OccupancyGrid, ray_marching, render_weight_from_density, accumulate_along_rays


@models.register('nerf')
class NeRFModel(BaseModel):
    def setup(self):
        self.geometry = models.make(self.config.geometry.name, self.config.geometry)
        self.texture = models.make(self.config.texture.name, self.config.texture)
        self.register_buffer('scene_aabb', torch.as_tensor([-self.config.radius, -self.config.radius, -self.config.radius, self.config.radius, self.config.radius, self.config.radius], dtype=torch.float32))

        if self.config.learned_background:
            self.occupancy_grid_res = 256
            self.near_plane, self.far_plane = 0.2, 1e4
            self.cone_angle = 10**(math.log10(self.far_plane) / self.config.num_samples_per_ray) - 1. # approximate
            self.render_step_size = 0.01 # render_step_size = max(distance_to_camera * self.cone_angle, self.render_step_size)
            self.contraction_type = ContractionType.UN_BOUNDED_SPHERE
        else:
            self.occupancy_grid_res = 128
            self.near_plane, self.far_plane = None, None
            self.cone_angle = 0.0
            self.render_step_size = 1.732 * 2 * self.config.radius / self.config.num_samples_per_ray
            self.contraction_type = ContractionType.AABB

        self.geometry.contraction_type = self.contraction_type

        if self.config.grid_prune:
            self.occupancy_grid = OccupancyGrid(
                roi_aabb=self.scene_aabb,
                resolution=self.occupancy_grid_res,
                contraction_type=self.contraction_type
            )
        self.randomized = self.config.randomized
        self.background_color = None

    def update_step(self, epoch, global_step):
        update_module_step(self.geometry, epoch, global_step)
        update_module_step(self.texture, epoch, global_step)

        def occ_eval_fn(x):
            density, _ = self.geometry(x)
            # approximate for 1 - torch.exp(-density[...,None] * self.render_step_size) based on taylor series
            return density[...,None] * self.render_step_size

        if self.training and self.config.grid_prune:
            self.occupancy_grid.every_n_step(step=global_step, occ_eval_fn=occ_eval_fn)

    def isosurface(self):
        mesh = self.geometry.isosurface()
        return mesh

    def forward_(self, rays):
        n_rays = rays.shape[0]
        rays_o, rays_d = rays[:, 0:3], rays[:, 3:6] # both (N_rays, 3)

        def sigma_fn(t_starts, t_ends, ray_indices):
            ray_indices = ray_indices.long()
            t_origins = rays_o[ray_indices]
            t_dirs = rays_d[ray_indices]
            positions = t_origins + t_dirs * (t_starts + t_ends) / 2.
            density, _ = self.geometry(positions)
            return density[...,None]

        def rgb_sigma_fn(t_starts, t_ends, ray_indices):
            ray_indices = ray_indices.long()
            t_origins = rays_o[ray_indices]
            t_dirs = rays_d[ray_indices]
            positions = t_origins + t_dirs * (t_starts + t_ends) / 2.
            density, feature = self.geometry(positions)
            rgb = self.texture(feature, t_dirs)
            return rgb, density[...,None]

        with torch.no_grad():
            ray_indices, t_starts, t_ends = ray_marching(
                rays_o, rays_d,
                scene_aabb=None if self.config.learned_background else self.scene_aabb,
                grid=self.occupancy_grid if self.config.grid_prune else None,
                sigma_fn=sigma_fn,
                near_plane=self.near_plane, far_plane=self.far_plane,
                render_step_size=self.render_step_size,
                stratified=self.randomized,
                cone_angle=self.cone_angle,
                alpha_thre=0.0
            )

        ray_indices = ray_indices.long()
        t_origins = rays_o[ray_indices]
        t_dirs = rays_d[ray_indices]
        midpoints = (t_starts + t_ends) / 2.
        positions = t_origins + t_dirs * midpoints
        intervals = t_ends - t_starts

        density, feature = self.geometry(positions)
        rgb = self.texture(feature, t_dirs)

        weights = render_weight_from_density(t_starts, t_ends, density[...,None], ray_indices=ray_indices, n_rays=n_rays)
        opacity = accumulate_along_rays(weights, ray_indices, values=None, n_rays=n_rays)
        depth = accumulate_along_rays(weights, ray_indices, values=midpoints, n_rays=n_rays)
        comp_rgb = accumulate_along_rays(weights, ray_indices, values=rgb, n_rays=n_rays)
        comp_rgb = comp_rgb + self.background_color * (1.0 - opacity)

        out = {
            'comp_rgb': comp_rgb,
            'opacity': opacity,
            'depth': depth,
            'rays_valid': opacity > 0,
            'num_samples': torch.as_tensor([len(t_starts)], dtype=torch.int32, device=rays.device)
        }

        if self.training:
            out.update({
                'weights': weights.view(-1),
                'points': midpoints.view(-1),
                'intervals': intervals.view(-1),
                'ray_indices': ray_indices.view(-1)
            })

        return out

    def forward(self, rays):
        if self.training:
            out = self.forward_(rays)
        else:
            out = chunk_batch(self.forward_, self.config.ray_chunk, True, rays)
        return {
            **out,
        }

    def train(self, mode=True):
        self.randomized = mode and self.config.randomized
        return super().train(mode=mode)

    def eval(self):
        self.randomized = False
        return super().eval()

    def regularizations(self, out):
        losses = {}
        losses.update(self.geometry.regularizations(out))
        losses.update(self.texture.regularizations(out))
        return losses

    @torch.no_grad()
    def export(self, export_config):
        mesh = self.isosurface()
        if export_config.export_vertex_color:
            _, feature = chunk_batch(self.geometry, export_config.chunk_size, False, mesh['v_pos'].to(self.rank))
            viewdirs = torch.zeros(feature.shape[0], 3).to(feature)
            viewdirs[...,2] = -1. # set the viewing directions to be -z (looking down)
            rgb = self.texture(feature, viewdirs).clamp(0,1)
            mesh['v_rgb'] = rgb.cpu()
        return mesh
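For orientation only (this sketch is not part of the commit): `NeRFModel.forward_` above unpacks each ray from a single `(N_rays, 6)` tensor, with origins in columns 0:3 and directions in 3:6. The snippet below shows that expected layout; how a `model` instance is obtained (via the `models` registry and a config) and the concrete ray values are assumptions for illustration.

# Hypothetical usage sketch -- not part of this commit.
# NeRFModel.forward expects rays packed as (N_rays, 6): origins in [:, 0:3], directions in [:, 3:6].
import torch
import torch.nn.functional as F

n_rays = 4096
rays_o = torch.zeros(n_rays, 3)                               # example camera centers
rays_d = F.normalize(torch.randn(n_rays, 3), dim=-1)          # example unit view directions
rays = torch.cat([rays_o, rays_d], dim=-1)                    # shape (n_rays, 6)

# out = model(rays)   # 'model' assumed to be a NeRFModel built through models.make(...)
# out['comp_rgb'], out['opacity'], out['depth'] each have one row per ray;
# in training mode, out also carries per-sample 'weights' / 'intervals' / 'ray_indices'.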
    	
mesh_recon/models/network_utils.py
ADDED
@@ -0,0 +1,215 @@
import math
import numpy as np

import torch
import torch.nn as nn
import tinycudann as tcnn

from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_info

from utils.misc import config_to_primitive, get_rank
from models.utils import get_activation
from systems.utils import update_module_step

class VanillaFrequency(nn.Module):
    def __init__(self, in_channels, config):
        super().__init__()
        self.N_freqs = config['n_frequencies']
        self.in_channels, self.n_input_dims = in_channels, in_channels
        self.funcs = [torch.sin, torch.cos]
        self.freq_bands = 2**torch.linspace(0, self.N_freqs-1, self.N_freqs)
        self.n_output_dims = self.in_channels * (len(self.funcs) * self.N_freqs)
        self.n_masking_step = config.get('n_masking_step', 0)
        self.update_step(None, None) # mask should be updated at the beginning each step

    def forward(self, x):
        out = []
        for freq, mask in zip(self.freq_bands, self.mask):
            for func in self.funcs:
                out += [func(freq*x) * mask]
        return torch.cat(out, -1)

    def update_step(self, epoch, global_step):
        if self.n_masking_step <= 0 or global_step is None:
            self.mask = torch.ones(self.N_freqs, dtype=torch.float32)
        else:
            self.mask = (1. - torch.cos(math.pi * (global_step / self.n_masking_step * self.N_freqs - torch.arange(0, self.N_freqs)).clamp(0, 1))) / 2.
            rank_zero_debug(f'Update mask: {global_step}/{self.n_masking_step} {self.mask}')


class ProgressiveBandHashGrid(nn.Module):
    def __init__(self, in_channels, config):
        super().__init__()
        self.n_input_dims = in_channels
        encoding_config = config.copy()
        encoding_config['otype'] = 'HashGrid'
        with torch.cuda.device(get_rank()):
            self.encoding = tcnn.Encoding(in_channels, encoding_config)
        self.n_output_dims = self.encoding.n_output_dims
        self.n_level = config['n_levels']
        self.n_features_per_level = config['n_features_per_level']
        self.start_level, self.start_step, self.update_steps = config['start_level'], config['start_step'], config['update_steps']
        self.current_level = self.start_level
        self.mask = torch.zeros(self.n_level * self.n_features_per_level, dtype=torch.float32, device=get_rank())

    def forward(self, x):
        enc = self.encoding(x)
        enc = enc * self.mask
        return enc

    def update_step(self, epoch, global_step):
        current_level = min(self.start_level + max(global_step - self.start_step, 0) // self.update_steps, self.n_level)
        if current_level > self.current_level:
            rank_zero_info(f'Update grid level to {current_level}')
        self.current_level = current_level
        self.mask[:self.current_level * self.n_features_per_level] = 1.


class CompositeEncoding(nn.Module):
    def __init__(self, encoding, include_xyz=False, xyz_scale=1., xyz_offset=0.):
        super(CompositeEncoding, self).__init__()
        self.encoding = encoding
        self.include_xyz, self.xyz_scale, self.xyz_offset = include_xyz, xyz_scale, xyz_offset
        self.n_output_dims = int(self.include_xyz) * self.encoding.n_input_dims + self.encoding.n_output_dims

    def forward(self, x, *args):
        return self.encoding(x, *args) if not self.include_xyz else torch.cat([x * self.xyz_scale + self.xyz_offset, self.encoding(x, *args)], dim=-1)

    def update_step(self, epoch, global_step):
        update_module_step(self.encoding, epoch, global_step)


def get_encoding(n_input_dims, config):
    # input suppose to be range [0, 1]
    if config.otype == 'VanillaFrequency':
        encoding = VanillaFrequency(n_input_dims, config_to_primitive(config))
    elif config.otype == 'ProgressiveBandHashGrid':
        encoding = ProgressiveBandHashGrid(n_input_dims, config_to_primitive(config))
    else:
        with torch.cuda.device(get_rank()):
            encoding = tcnn.Encoding(n_input_dims, config_to_primitive(config))
    encoding = CompositeEncoding(encoding, include_xyz=config.get('include_xyz', False), xyz_scale=2., xyz_offset=-1.)
    return encoding


class VanillaMLP(nn.Module):
    def __init__(self, dim_in, dim_out, config):
        super().__init__()
        self.n_neurons, self.n_hidden_layers = config['n_neurons'], config['n_hidden_layers']
        self.sphere_init, self.weight_norm = config.get('sphere_init', False), config.get('weight_norm', False)
        self.sphere_init_radius = config.get('sphere_init_radius', 0.5)
        self.layers = [self.make_linear(dim_in, self.n_neurons, is_first=True, is_last=False), self.make_activation()]
        for i in range(self.n_hidden_layers - 1):
            self.layers += [self.make_linear(self.n_neurons, self.n_neurons, is_first=False, is_last=False), self.make_activation()]
        self.layers += [self.make_linear(self.n_neurons, dim_out, is_first=False, is_last=True)]
        self.layers = nn.Sequential(*self.layers)
        self.output_activation = get_activation(config['output_activation'])

    @torch.cuda.amp.autocast(False)
    def forward(self, x):
        x = self.layers(x.float())
        x = self.output_activation(x)
        return x

    def make_linear(self, dim_in, dim_out, is_first, is_last):
        layer = nn.Linear(dim_in, dim_out, bias=True) # network without bias will degrade quality
        if self.sphere_init:
            if is_last:
                torch.nn.init.constant_(layer.bias, -self.sphere_init_radius)
                torch.nn.init.normal_(layer.weight, mean=math.sqrt(math.pi) / math.sqrt(dim_in), std=0.0001)
            elif is_first:
                torch.nn.init.constant_(layer.bias, 0.0)
                torch.nn.init.constant_(layer.weight[:, 3:], 0.0)
                torch.nn.init.normal_(layer.weight[:, :3], 0.0, math.sqrt(2) / math.sqrt(dim_out))
            else:
                torch.nn.init.constant_(layer.bias, 0.0)
                torch.nn.init.normal_(layer.weight, 0.0, math.sqrt(2) / math.sqrt(dim_out))
        else:
            torch.nn.init.constant_(layer.bias, 0.0)
            torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')

        if self.weight_norm:
            layer = nn.utils.weight_norm(layer)
        return layer

    def make_activation(self):
        if self.sphere_init:
            return nn.Softplus(beta=100)
        else:
            return nn.ReLU(inplace=True)


def sphere_init_tcnn_network(n_input_dims, n_output_dims, config, network):
    rank_zero_debug('Initialize tcnn MLP to approximately represent a sphere.')
    """
    from https://github.com/NVlabs/tiny-cuda-nn/issues/96
    It's the weight matrices of each layer laid out in row-major order and then concatenated.
    Notably: inputs and output dimensions are padded to multiples of 8 (CutlassMLP) or 16 (FullyFusedMLP).
    The padded input dimensions get a constant value of 1.0,
    whereas the padded output dimensions are simply ignored,
    so the weights pertaining to those can have any value.
    """
    padto = 16 if config.otype == 'FullyFusedMLP' else 8
    n_input_dims = n_input_dims + (padto - n_input_dims % padto) % padto
    n_output_dims = n_output_dims + (padto - n_output_dims % padto) % padto
    data = list(network.parameters())[0].data
    assert data.shape[0] == (n_input_dims + n_output_dims) * config.n_neurons + (config.n_hidden_layers - 1) * config.n_neurons**2
    new_data = []
    # first layer
    weight = torch.zeros((config.n_neurons, n_input_dims)).to(data)
    torch.nn.init.constant_(weight[:, 3:], 0.0)
    torch.nn.init.normal_(weight[:, :3], 0.0, math.sqrt(2) / math.sqrt(config.n_neurons))
    new_data.append(weight.flatten())
    # hidden layers
    for i in range(config.n_hidden_layers - 1):
        weight = torch.zeros((config.n_neurons, config.n_neurons)).to(data)
        torch.nn.init.normal_(weight, 0.0, math.sqrt(2) / math.sqrt(config.n_neurons))
        new_data.append(weight.flatten())
    # last layer
    weight = torch.zeros((n_output_dims, config.n_neurons)).to(data)
    torch.nn.init.normal_(weight, mean=math.sqrt(math.pi) / math.sqrt(config.n_neurons), std=0.0001)
    new_data.append(weight.flatten())
    new_data = torch.cat(new_data)
    data.copy_(new_data)


def get_mlp(n_input_dims, n_output_dims, config):
    if config.otype == 'VanillaMLP':
        network = VanillaMLP(n_input_dims, n_output_dims, config_to_primitive(config))
    else:
        with torch.cuda.device(get_rank()):
            network = tcnn.Network(n_input_dims, n_output_dims, config_to_primitive(config))
            if config.get('sphere_init', False):
                sphere_init_tcnn_network(n_input_dims, n_output_dims, config, network)
    return network


class EncodingWithNetwork(nn.Module):
    def __init__(self, encoding, network):
        super().__init__()
        self.encoding, self.network = encoding, network

    def forward(self, x):
        return self.network(self.encoding(x))

    def update_step(self, epoch, global_step):
        update_module_step(self.encoding, epoch, global_step)
        update_module_step(self.network, epoch, global_step)


def get_encoding_with_network(n_input_dims, n_output_dims, encoding_config, network_config):
    # input suppose to be range [0, 1]
    if encoding_config.otype in ['VanillaFrequency', 'ProgressiveBandHashGrid'] \
        or network_config.otype in ['VanillaMLP']:
        encoding = get_encoding(n_input_dims, encoding_config)
        network = get_mlp(encoding.n_output_dims, n_output_dims, network_config)
        encoding_with_network = EncodingWithNetwork(encoding, network)
    else:
        with torch.cuda.device(get_rank()):
            encoding_with_network = tcnn.NetworkWithInputEncoding(
                n_input_dims=n_input_dims,
                n_output_dims=n_output_dims,
                encoding_config=config_to_primitive(encoding_config),
                network_config=config_to_primitive(network_config)
            )
    return encoding_with_network
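For orientation only (this sketch is not part of the commit): the factories above dispatch on `config.otype` and otherwise read configs like dictionaries, which suggests OmegaConf-style configs (see `config_to_primitive`). The example below wires a `ProgressiveBandHashGrid` encoding into a `VanillaMLP` through `get_encoding_with_network`; the concrete field values, the `output_activation` string, and the availability of tinycudann/CUDA are assumptions for illustration.

# Hypothetical illustration only -- not part of this commit.
from omegaconf import OmegaConf

pos_encoding_config = OmegaConf.create({
    'otype': 'ProgressiveBandHashGrid',   # routed to ProgressiveBandHashGrid in get_encoding
    'n_levels': 16,
    'n_features_per_level': 2,
    'log2_hashmap_size': 19,
    'base_resolution': 16,
    'per_level_scale': 1.447,
    'include_xyz': True,                  # CompositeEncoding prepends xyz scaled to [-1, 1]
    'start_level': 4,                     # coarse-to-fine schedule read in __init__/update_step
    'start_step': 0,
    'update_steps': 1000,
})
mlp_network_config = OmegaConf.create({
    'otype': 'VanillaMLP',                # routed to VanillaMLP in get_mlp
    'n_neurons': 64,
    'n_hidden_layers': 1,
    'output_activation': 'none',          # assumed to be accepted by models.utils.get_activation
})

# 3D positions in [0, 1] -> 16-dim feature; calling update_step() on the module
# advances the progressive hash-grid mask during training.
encoding_with_network = get_encoding_with_network(3, 16, pos_encoding_config, mlp_network_config)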