Maikou committed on
Commit b621857
Parent: 391d2ef

related files and example data

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.

Files changed (50)
  1. configs/aligned_shape_latents/shapevae-256.yaml +46 -0
  2. configs/deploy/clip_aslp_3df+3dc+abo+gso+toy+t10k+obj+sp+pk=256_01_4096_8_ckpt_250000_udt=110M_finetune_500000_deploy.yaml +181 -0
  3. configs/deploy/clip_sp+pk_aslperceiver=256_01_4096_8_udt=03.yaml +180 -0
  4. configs/image_cond_diffuser_asl/image-ASLDM-256.yaml +97 -0
  5. configs/text_cond_diffuser_asl/text-ASLDM-256.yaml +98 -0
  6. example_data/image/car.jpg +0 -0
  7. example_data/surface/surface.npz +3 -0
  8. gradio_cached_dir/example/img_example/airplane.jpg +0 -0
  9. gradio_cached_dir/example/img_example/alita.jpg +0 -0
  10. gradio_cached_dir/example/img_example/bag.jpg +0 -0
  11. gradio_cached_dir/example/img_example/bench.jpg +0 -0
  12. gradio_cached_dir/example/img_example/building.jpg +0 -0
  13. gradio_cached_dir/example/img_example/burger.jpg +0 -0
  14. gradio_cached_dir/example/img_example/car.jpg +0 -0
  15. gradio_cached_dir/example/img_example/loopy.jpg +0 -0
  16. gradio_cached_dir/example/img_example/mario.jpg +0 -0
  17. gradio_cached_dir/example/img_example/ship.jpg +0 -0
  18. michelangelo/__init__.py +1 -0
  19. michelangelo/__pycache__/__init__.cpython-39.pyc +0 -0
  20. michelangelo/data/__init__.py +1 -0
  21. michelangelo/data/__pycache__/__init__.cpython-39.pyc +0 -0
  22. michelangelo/data/__pycache__/asl_webdataset.cpython-39.pyc +0 -0
  23. michelangelo/data/__pycache__/tokenizer.cpython-39.pyc +0 -0
  24. michelangelo/data/__pycache__/transforms.cpython-39.pyc +0 -0
  25. michelangelo/data/__pycache__/utils.cpython-39.pyc +0 -0
  26. michelangelo/data/templates.json +69 -0
  27. michelangelo/data/transforms.py +407 -0
  28. michelangelo/data/utils.py +59 -0
  29. michelangelo/graphics/__init__.py +1 -0
  30. michelangelo/graphics/__pycache__/__init__.cpython-39.pyc +0 -0
  31. michelangelo/graphics/primitives/__init__.py +9 -0
  32. michelangelo/graphics/primitives/__pycache__/__init__.cpython-39.pyc +0 -0
  33. michelangelo/graphics/primitives/__pycache__/extract_texture_map.cpython-39.pyc +0 -0
  34. michelangelo/graphics/primitives/__pycache__/mesh.cpython-39.pyc +0 -0
  35. michelangelo/graphics/primitives/__pycache__/volume.cpython-39.pyc +0 -0
  36. michelangelo/graphics/primitives/mesh.py +114 -0
  37. michelangelo/graphics/primitives/volume.py +21 -0
  38. michelangelo/models/__init__.py +1 -0
  39. michelangelo/models/__pycache__/__init__.cpython-39.pyc +0 -0
  40. michelangelo/models/asl_diffusion/__init__.py +1 -0
  41. michelangelo/models/asl_diffusion/__pycache__/__init__.cpython-39.pyc +0 -0
  42. michelangelo/models/asl_diffusion/__pycache__/asl_udt.cpython-39.pyc +0 -0
  43. michelangelo/models/asl_diffusion/__pycache__/clip_asl_diffuser_pl_module.cpython-39.pyc +0 -0
  44. michelangelo/models/asl_diffusion/__pycache__/inference_utils.cpython-39.pyc +0 -0
  45. michelangelo/models/asl_diffusion/asl_diffuser_pl_module.py +483 -0
  46. michelangelo/models/asl_diffusion/asl_udt.py +104 -0
  47. michelangelo/models/asl_diffusion/base.py +13 -0
  48. michelangelo/models/asl_diffusion/clip_asl_diffuser_pl_module.py +393 -0
  49. michelangelo/models/asl_diffusion/inference_utils.py +80 -0
  50. michelangelo/models/conditional_encoders/__init__.py +3 -0
configs/aligned_shape_latents/shapevae-256.yaml ADDED
@@ -0,0 +1,46 @@
+ model:
+   target: michelangelo.models.tsal.asl_pl_module.AlignedShapeAsLatentPLModule
+   params:
+     shape_module_cfg:
+       target: michelangelo.models.tsal.sal_perceiver.AlignedShapeLatentPerceiver
+       params:
+         num_latents: 256
+         embed_dim: 64
+         point_feats: 3 # normal
+         num_freqs: 8
+         include_pi: false
+         heads: 12
+         width: 768
+         num_encoder_layers: 8
+         num_decoder_layers: 16
+         use_ln_post: true
+         init_scale: 0.25
+         qkv_bias: false
+         use_checkpoint: true
+     aligned_module_cfg:
+       target: michelangelo.models.tsal.clip_asl_module.CLIPAlignedShapeAsLatentModule
+       params:
+         clip_model_version: "./checkpoints/clip/clip-vit-large-patch14"
+
+     loss_cfg:
+       target: michelangelo.models.tsal.loss.ContrastKLNearFar
+       params:
+         contrast_weight: 0.1
+         near_weight: 0.1
+         kl_weight: 0.001
+
+     optimizer_cfg:
+       optimizer:
+         target: torch.optim.AdamW
+         params:
+           betas: [0.9, 0.99]
+           eps: 1.e-6
+           weight_decay: 1.e-2
+
+       scheduler:
+         target: michelangelo.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
+         params:
+           warm_up_steps: 5000
+           f_start: 1.e-6
+           f_min: 1.e-3
+           f_max: 1.0
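Editor's note: this config follows the target/params convention, where each block names a class and its constructor arguments. Below is a minimal sketch of how such a block is resolved, assuming OmegaConf is installed; it mirrors the instantiate_from_config helper that michelangelo.utils is expected to provide (it is imported by michelangelo/data/transforms.py later in this commit), not a verbatim copy of it.

import importlib
from omegaconf import OmegaConf

def get_obj_from_str(name: str):
    module, cls = name.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)

def instantiate_from_config(config, **extra_kwargs):
    # "target" names the class; "params" holds its constructor kwargs
    return get_obj_from_str(config["target"])(**config.get("params", dict()), **extra_kwargs)

cfg = OmegaConf.load("configs/aligned_shape_latents/shapevae-256.yaml")
shape_vae = instantiate_from_config(cfg.model)  # builds AlignedShapeAsLatentPLModule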
configs/deploy/clip_aslp_3df+3dc+abo+gso+toy+t10k+obj+sp+pk=256_01_4096_8_ckpt_250000_udt=110M_finetune_500000_deploy.yaml ADDED
@@ -0,0 +1,181 @@
+ name: "0630_clip_aslp_3df+3dc+abo+gso+toy+t10k+obj+sp+pk=256_01_4096_8_ckpt_250000_udt=110M_finetune_500000"
+ #wandb:
+ #  project: "image_diffuser"
+ #  offline: false
+
+
+ training:
+   steps: 500000
+   use_amp: true
+   ckpt_path: ""
+   base_lr: 1.e-4
+   gradient_clip_val: 5.0
+   gradient_clip_algorithm: "norm"
+   every_n_train_steps: 5000
+   val_check_interval: 1024
+   limit_val_batches: 16
+
+ dataset:
+   target: michelangelo.data.asl_webdataset.MultiAlignedShapeLatentModule
+   params:
+     batch_size: 38
+     num_workers: 4
+     val_num_workers: 4
+     buffer_size: 256
+     return_normal: true
+     random_crop: false
+     surface_sampling: true
+     pc_size: &pc_size 4096
+     image_size: 384
+     mean: &mean [0.5, 0.5, 0.5]
+     std: &std [0.5, 0.5, 0.5]
+     cond_stage_key: "image"
+
+     meta_info:
+       3D-FUTURE:
+         render_folder: "/root/workspace/cq_workspace/datasets/3D-FUTURE/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/3D-FUTURE"
+
+       ABO:
+         render_folder: "/root/workspace/cq_workspace/datasets/ABO/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/ABO"
+
+       GSO:
+         render_folder: "/root/workspace/cq_workspace/datasets/GSO/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/GSO"
+
+       TOYS4K:
+         render_folder: "/root/workspace/cq_workspace/datasets/TOYS4K/TOYS4K/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/TOYS4K"
+
+       3DCaricShop:
+         render_folder: "/root/workspace/cq_workspace/datasets/3DCaricShop/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/3DCaricShop"
+
+       Thingi10K:
+         render_folder: "/root/workspace/cq_workspace/datasets/Thingi10K/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/Thingi10K"
+
+       shapenet:
+         render_folder: "/root/workspace/cq_workspace/datasets/shapenet/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/shapenet"
+
+       pokemon:
+         render_folder: "/root/workspace/cq_workspace/datasets/pokemon/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/pokemon"
+
+       objaverse:
+         render_folder: "/root/workspace/cq_workspace/datasets/objaverse/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/objaverse"
+
+ model:
+   target: michelangelo.models.asl_diffusion.clip_asl_diffuser_pl_module.ClipASLDiffuser
+   params:
+     first_stage_config:
+       target: michelangelo.models.tsal.asl_pl_module.AlignedShapeAsLatentPLModule
+       params:
+         shape_module_cfg:
+           target: michelangelo.models.tsal.sal_perceiver.AlignedShapeLatentPerceiver
+           params:
+             num_latents: &num_latents 256
+             embed_dim: &embed_dim 64
+             point_feats: 3 # normal
+             num_freqs: 8
+             include_pi: false
+             heads: 12
+             width: 768
+             num_encoder_layers: 8
+             num_decoder_layers: 16
+             use_ln_post: true
+             init_scale: 0.25
+             qkv_bias: false
+             use_checkpoint: false
+         aligned_module_cfg:
+           target: michelangelo.models.tsal.clip_asl_module.CLIPAlignedShapeAsLatentModule
+           params:
+             clip_model_version: "/mnt/shadow_cv_training/stevenxxliu/checkpoints/clip/clip-vit-large-patch14"
+             # clip_model_version: "/root/workspace/checkpoints/clip/clip-vit-large-patch14"
+
+         loss_cfg:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: michelangelo.models.conditional_encoders.encoder_factory.FrozenCLIPImageGridEmbedder
+       params:
+         version: "/mnt/shadow_cv_training/stevenxxliu/checkpoints/clip/clip-vit-large-patch14"
+         # version: "/root/workspace/checkpoints/clip/clip-vit-large-patch14"
+         zero_embedding_radio: 0.1
+
+     first_stage_key: "surface"
+     cond_stage_key: "image"
+     scale_by_std: false
+
+     denoiser_cfg:
+       target: michelangelo.models.asl_diffusion.asl_udt.ConditionalASLUDTDenoiser
+       params:
+         input_channels: *embed_dim
+         output_channels: *embed_dim
+         n_ctx: *num_latents
+         width: 768
+         layers: 6 # 2 * 6 + 1 = 13
+         heads: 12
+         context_dim: 1024
+         init_scale: 1.0
+         skip_ln: true
+         use_checkpoint: true
+
+     scheduler_cfg:
+       guidance_scale: 7.5
+       num_inference_steps: 50
+       eta: 0.0
+
+       noise:
+         target: diffusers.schedulers.DDPMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           variance_type: "fixed_small"
+           clip_sample: false
+       denoise:
+         target: diffusers.schedulers.DDIMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           clip_sample: false # clip sample to -1~1
+           set_alpha_to_one: false
+           steps_offset: 1
+
+     optimizer_cfg:
+       optimizer:
+         target: torch.optim.AdamW
+         params:
+           betas: [0.9, 0.99]
+           eps: 1.e-6
+           weight_decay: 1.e-2
+
+       scheduler:
+         target: michelangelo.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
+         params:
+           warm_up_steps: 5000
+           f_start: 1.e-6
+           f_min: 1.e-3
+           f_max: 1.0
+
+     loss_cfg:
+       loss_type: "mse"
+
+ logger:
+   target: michelangelo.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
+   params:
+     step_frequency: 2000
+     num_samples: 4
+     sample_times: 4
+     mean: *mean
+     std: *std
+     bounds: [-1.1, -1.1, -1.1, 1.1, 1.1, 1.1]
+     octree_depth: 7
+     num_chunks: 10000
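Editor's note: two YAML anchors do real work in this file: *embed_dim ties the shape VAE's latent width to the denoiser's input/output channels, and *num_latents ties the number of shape latents to the denoiser's context length. A quick verification sketch, assuming OmegaConf (anchors are resolved by the YAML parser at load time, so the loaded values are plain):

from omegaconf import OmegaConf

cfg = OmegaConf.load(
    "configs/deploy/clip_aslp_3df+3dc+abo+gso+toy+t10k+obj+sp+pk"
    "=256_01_4096_8_ckpt_250000_udt=110M_finetune_500000_deploy.yaml"
)
shape = cfg.model.params.first_stage_config.params.shape_module_cfg.params
denoiser = cfg.model.params.denoiser_cfg.params
assert denoiser.input_channels == shape.embed_dim == 64
assert denoiser.n_ctx == shape.num_latents == 256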
configs/deploy/clip_sp+pk_aslperceiver=256_01_4096_8_udt=03.yaml ADDED
@@ -0,0 +1,180 @@
+ name: "0428_clip_subsp+pk_sal_perceiver=256_01_4096_8_udt=03"
+ #wandb:
+ #  project: "image_diffuser"
+ #  offline: false
+
+ training:
+   steps: 500000
+   use_amp: true
+   ckpt_path: ""
+   base_lr: 1.e-4
+   gradient_clip_val: 5.0
+   gradient_clip_algorithm: "norm"
+   every_n_train_steps: 5000
+   val_check_interval: 1024
+   limit_val_batches: 16
+
+ # dataset
+ dataset:
+   target: michelangelo.data.asl_torch_dataset.MultiAlignedShapeImageTextModule
+   params:
+     batch_size: 38
+     num_workers: 4
+     val_num_workers: 4
+     buffer_size: 256
+     return_normal: true
+     random_crop: false
+     surface_sampling: true
+     pc_size: &pc_size 4096
+     image_size: 384
+     mean: &mean [0.5, 0.5, 0.5]
+     std: &std [0.5, 0.5, 0.5]
+
+     cond_stage_key: "text"
+
+     meta_info:
+       3D-FUTURE:
+         render_folder: "/root/workspace/cq_workspace/datasets/3D-FUTURE/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/3D-FUTURE"
+
+       ABO:
+         render_folder: "/root/workspace/cq_workspace/datasets/ABO/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/ABO"
+
+       GSO:
+         render_folder: "/root/workspace/cq_workspace/datasets/GSO/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/GSO"
+
+       TOYS4K:
+         render_folder: "/root/workspace/cq_workspace/datasets/TOYS4K/TOYS4K/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/TOYS4K"
+
+       3DCaricShop:
+         render_folder: "/root/workspace/cq_workspace/datasets/3DCaricShop/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/3DCaricShop"
+
+       Thingi10K:
+         render_folder: "/root/workspace/cq_workspace/datasets/Thingi10K/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/Thingi10K"
+
+       shapenet:
+         render_folder: "/root/workspace/cq_workspace/datasets/shapenet/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/shapenet"
+
+       pokemon:
+         render_folder: "/root/workspace/cq_workspace/datasets/pokemon/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/pokemon"
+
+       objaverse:
+         render_folder: "/root/workspace/cq_workspace/datasets/objaverse/renders"
+         tar_folder: "/root/workspace/datasets/make_tars/objaverse"
+
+ model:
+   target: michelangelo.models.asl_diffusion.clip_asl_diffuser_pl_module.ClipASLDiffuser
+   params:
+     first_stage_config:
+       target: michelangelo.models.tsal.asl_pl_module.AlignedShapeAsLatentPLModule
+       params:
+         # ckpt_path: "/root/workspace/cq_workspace/michelangelo/experiments/aligned_shape_latents/clip_aslperceiver_sp+pk_01_01/ckpt/ckpt-step=00230000.ckpt"
+         shape_module_cfg:
+           target: michelangelo.models.tsal.sal_perceiver.AlignedShapeLatentPerceiver
+           params:
+             num_latents: &num_latents 256
+             embed_dim: &embed_dim 64
+             point_feats: 3 # normal
+             num_freqs: 8
+             include_pi: false
+             heads: 12
+             width: 768
+             num_encoder_layers: 8
+             num_decoder_layers: 16
+             use_ln_post: true
+             init_scale: 0.25
+             qkv_bias: false
+             use_checkpoint: true
+         aligned_module_cfg:
+           target: michelangelo.models.tsal.clip_asl_module.CLIPAlignedShapeAsLatentModule
+           params:
+             clip_model_version: "/mnt/shadow_cv_training/stevenxxliu/checkpoints/clip/clip-vit-large-patch14"
+
+         loss_cfg:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: michelangelo.models.conditional_encoders.encoder_factory.FrozenAlignedCLIPTextEmbedder
+       params:
+         version: "/mnt/shadow_cv_training/stevenxxliu/checkpoints/clip/clip-vit-large-patch14"
+         zero_embedding_radio: 0.1
+         max_length: 77
+
+     first_stage_key: "surface"
+     cond_stage_key: "text"
+     scale_by_std: false
+
+     denoiser_cfg:
+       target: michelangelo.models.asl_diffusion.asl_udt.ConditionalASLUDTDenoiser
+       params:
+         input_channels: *embed_dim
+         output_channels: *embed_dim
+         n_ctx: *num_latents
+         width: 768
+         layers: 8 # 2 * 8 + 1 = 17
+         heads: 12
+         context_dim: 768
+         init_scale: 1.0
+         skip_ln: true
+         use_checkpoint: true
+
+     scheduler_cfg:
+       guidance_scale: 7.5
+       num_inference_steps: 50
+       eta: 0.0
+
+       noise:
+         target: diffusers.schedulers.DDPMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           variance_type: "fixed_small"
+           clip_sample: false
+       denoise:
+         target: diffusers.schedulers.DDIMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           clip_sample: false # clip sample to -1~1
+           set_alpha_to_one: false
+           steps_offset: 1
+
+     optimizer_cfg:
+       optimizer:
+         target: torch.optim.AdamW
+         params:
+           betas: [0.9, 0.99]
+           eps: 1.e-6
+           weight_decay: 1.e-2
+
+       scheduler:
+         target: michelangelo.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
+         params:
+           warm_up_steps: 5000
+           f_start: 1.e-6
+           f_min: 1.e-3
+           f_max: 1.0
+
+     loss_cfg:
+       loss_type: "mse"
+
+ logger:
+   target: michelangelo.utils.trainings.mesh_log_callback.TextConditionalASLDiffuserLogger
+   params:
+     step_frequency: 1000
+     num_samples: 4
+     sample_times: 4
+     bounds: [-1.1, -1.1, -1.1, 1.1, 1.1, 1.1]
+     octree_depth: 7
+     num_chunks: 10000
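Editor's note: both deploy configs pair zero_embedding_radio: 0.1 (conditions zeroed out for roughly 10% of training samples) with guidance_scale: 7.5 at inference, i.e. classifier-free guidance. A sketch of the guided prediction follows; model and cond are hypothetical names and this is the standard CFG formula, not code from this repo:

import torch

def guided_noise_pred(model, noisy_z, t, cond, guidance_scale=7.5):
    uncond = torch.zeros_like(cond)  # matches the zero-embedding used during training
    eps_cond = model(noisy_z, t, cond)
    eps_uncond = model(noisy_z, t, uncond)
    # push the prediction away from the unconditional direction
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)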
configs/image_cond_diffuser_asl/image-ASLDM-256.yaml ADDED
@@ -0,0 +1,97 @@
+ model:
+   target: michelangelo.models.asl_diffusion.clip_asl_diffuser_pl_module.ClipASLDiffuser
+   params:
+     first_stage_config:
+       target: michelangelo.models.tsal.asl_pl_module.AlignedShapeAsLatentPLModule
+       params:
+         shape_module_cfg:
+           target: michelangelo.models.tsal.sal_perceiver.AlignedShapeLatentPerceiver
+           params:
+             num_latents: &num_latents 256
+             embed_dim: &embed_dim 64
+             point_feats: 3 # normal
+             num_freqs: 8
+             include_pi: false
+             heads: 12
+             width: 768
+             num_encoder_layers: 8
+             num_decoder_layers: 16
+             use_ln_post: true
+             init_scale: 0.25
+             qkv_bias: false
+             use_checkpoint: false
+         aligned_module_cfg:
+           target: michelangelo.models.tsal.clip_asl_module.CLIPAlignedShapeAsLatentModule
+           params:
+             clip_model_version: "./checkpoints/clip/clip-vit-large-patch14"
+
+         loss_cfg:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: michelangelo.models.conditional_encoders.encoder_factory.FrozenCLIPImageGridEmbedder
+       params:
+         version: "./checkpoints/clip/clip-vit-large-patch14"
+         zero_embedding_radio: 0.1
+
+     first_stage_key: "surface"
+     cond_stage_key: "image"
+     scale_by_std: false
+
+     denoiser_cfg:
+       target: michelangelo.models.asl_diffusion.asl_udt.ConditionalASLUDTDenoiser
+       params:
+         input_channels: *embed_dim
+         output_channels: *embed_dim
+         n_ctx: *num_latents
+         width: 768
+         layers: 6 # 2 * 6 + 1 = 13
+         heads: 12
+         context_dim: 1024
+         init_scale: 1.0
+         skip_ln: true
+         use_checkpoint: true
+
+     scheduler_cfg:
+       guidance_scale: 7.5
+       num_inference_steps: 50
+       eta: 0.0
+
+       noise:
+         target: diffusers.schedulers.DDPMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           variance_type: "fixed_small"
+           clip_sample: false
+       denoise:
+         target: diffusers.schedulers.DDIMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           clip_sample: false # clip sample to -1~1
+           set_alpha_to_one: false
+           steps_offset: 1
+
+     optimizer_cfg:
+       optimizer:
+         target: torch.optim.AdamW
+         params:
+           betas: [0.9, 0.99]
+           eps: 1.e-6
+           weight_decay: 1.e-2
+
+       scheduler:
+         target: michelangelo.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
+         params:
+           warm_up_steps: 5000
+           f_start: 1.e-6
+           f_min: 1.e-3
+           f_max: 1.0
+
+     loss_cfg:
+       loss_type: "mse"
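Editor's note: scheduler_cfg separates training noise (DDPM) from inference denoising (DDIM, 50 steps, eta 0). A minimal sampling sketch using the diffusers API with exactly the denoise parameters above; denoiser and cond are placeholders for ConditionalASLUDTDenoiser and the CLIP image-grid embedding, and the latent shape follows num_latents/embed_dim:

import torch
from diffusers.schedulers import DDIMScheduler

scheduler = DDIMScheduler(
    num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012,
    beta_schedule="scaled_linear", clip_sample=False,
    set_alpha_to_one=False, steps_offset=1,
)
scheduler.set_timesteps(50)

latents = torch.randn(1, 256, 64)  # [batch, n_ctx = num_latents, embed_dim]
for t in scheduler.timesteps:
    noise_pred = denoiser(latents, t.expand(1), cond)  # placeholder call
    latents = scheduler.step(noise_pred, t, latents, eta=0.0).prev_sample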
configs/text_cond_diffuser_asl/text-ASLDM-256.yaml ADDED
@@ -0,0 +1,98 @@
+ model:
+   target: michelangelo.models.asl_diffusion.clip_asl_diffuser_pl_module.ClipASLDiffuser
+   params:
+     first_stage_config:
+       target: michelangelo.models.tsal.asl_pl_module.AlignedShapeAsLatentPLModule
+       params:
+         shape_module_cfg:
+           target: michelangelo.models.tsal.sal_perceiver.AlignedShapeLatentPerceiver
+           params:
+             num_latents: &num_latents 256
+             embed_dim: &embed_dim 64
+             point_feats: 3 # normal
+             num_freqs: 8
+             include_pi: false
+             heads: 12
+             width: 768
+             num_encoder_layers: 8
+             num_decoder_layers: 16
+             use_ln_post: true
+             init_scale: 0.25
+             qkv_bias: false
+             use_checkpoint: true
+         aligned_module_cfg:
+           target: michelangelo.models.tsal.clip_asl_module.CLIPAlignedShapeAsLatentModule
+           params:
+             clip_model_version: "./checkpoints/clip/clip-vit-large-patch14"
+
+         loss_cfg:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: michelangelo.models.conditional_encoders.encoder_factory.FrozenAlignedCLIPTextEmbedder
+       params:
+         version: "./checkpoints/clip/clip-vit-large-patch14"
+         zero_embedding_radio: 0.1
+         max_length: 77
+
+     first_stage_key: "surface"
+     cond_stage_key: "text"
+     scale_by_std: false
+
+     denoiser_cfg:
+       target: michelangelo.models.asl_diffusion.asl_udt.ConditionalASLUDTDenoiser
+       params:
+         input_channels: *embed_dim
+         output_channels: *embed_dim
+         n_ctx: *num_latents
+         width: 768
+         layers: 8 # 2 * 8 + 1 = 17
+         heads: 12
+         context_dim: 768
+         init_scale: 1.0
+         skip_ln: true
+         use_checkpoint: true
+
+     scheduler_cfg:
+       guidance_scale: 7.5
+       num_inference_steps: 50
+       eta: 0.0
+
+       noise:
+         target: diffusers.schedulers.DDPMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           variance_type: "fixed_small"
+           clip_sample: false
+       denoise:
+         target: diffusers.schedulers.DDIMScheduler
+         params:
+           num_train_timesteps: 1000
+           beta_start: 0.00085
+           beta_end: 0.012
+           beta_schedule: "scaled_linear"
+           clip_sample: false # clip sample to -1~1
+           set_alpha_to_one: false
+           steps_offset: 1
+
+     optimizer_cfg:
+       optimizer:
+         target: torch.optim.AdamW
+         params:
+           betas: [0.9, 0.99]
+           eps: 1.e-6
+           weight_decay: 1.e-2
+
+       scheduler:
+         target: michelangelo.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
+         params:
+           warm_up_steps: 5000
+           f_start: 1.e-6
+           f_min: 1.e-3
+           f_max: 1.0
+
+     loss_cfg:
+       loss_type: "mse"
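Editor's note: all of these configs share LambdaWarmUpCosineFactorScheduler. The class itself lives in michelangelo.utils.trainings.lr_scheduler, which is not part of this commit, so the following is an assumed reimplementation of the warm-up-then-cosine factor schedule its parameters imply: linear warm-up from f_start to f_max over warm_up_steps, then cosine decay toward f_min; the factor multiplies the base learning rate (base_lr).

import math

def lr_factor(step, warm_up_steps=5000, f_start=1e-6, f_min=1e-3, f_max=1.0,
              max_decay_steps=500_000):
    # assumed reimplementation, not the repo's class
    if step < warm_up_steps:
        # linear warm-up
        return f_start + (f_max - f_start) * step / warm_up_steps
    # cosine decay from f_max down to f_min
    progress = (step - warm_up_steps) / max(1, max_decay_steps - warm_up_steps)
    return f_min + 0.5 * (f_max - f_min) * (1.0 + math.cos(math.pi * min(progress, 1.0)))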
example_data/image/car.jpg ADDED
example_data/surface/surface.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0893e44d82ada683baa656a718beaf6ec19fc28b6816b451f56645530d5bb962
+ size 1201024
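Editor's note: what is committed here is a Git LFS pointer, not the array data itself; after `git lfs pull`, the roughly 1.2 MB archive can be read with NumPy. The key names below are an assumption based on the transforms in this commit, which expect fields such as "surface":

import numpy as np

data = np.load("example_data/surface/surface.npz")
print(data.files)                # inspect the stored array names
surface = data[data.files[0]]    # e.g. a point cloud of shape [N, 3 + point_feats]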
gradio_cached_dir/example/img_example/airplane.jpg ADDED
gradio_cached_dir/example/img_example/alita.jpg ADDED
gradio_cached_dir/example/img_example/bag.jpg ADDED
gradio_cached_dir/example/img_example/bench.jpg ADDED
gradio_cached_dir/example/img_example/building.jpg ADDED
gradio_cached_dir/example/img_example/burger.jpg ADDED
gradio_cached_dir/example/img_example/car.jpg ADDED
gradio_cached_dir/example/img_example/loopy.jpg ADDED
gradio_cached_dir/example/img_example/mario.jpg ADDED
gradio_cached_dir/example/img_example/ship.jpg ADDED
michelangelo/__init__.py ADDED
@@ -0,0 +1 @@
+ # -*- coding: utf-8 -*-
michelangelo/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (176 Bytes).
michelangelo/data/__init__.py ADDED
@@ -0,0 +1 @@
+ # -*- coding: utf-8 -*-
michelangelo/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (181 Bytes).
michelangelo/data/__pycache__/asl_webdataset.cpython-39.pyc ADDED
Binary file (9.43 kB).
michelangelo/data/__pycache__/tokenizer.cpython-39.pyc ADDED
Binary file (6.48 kB).
michelangelo/data/__pycache__/transforms.cpython-39.pyc ADDED
Binary file (11.4 kB).
michelangelo/data/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.13 kB).
michelangelo/data/templates.json ADDED
@@ -0,0 +1,69 @@
+ {
+   "shape": [
+     "a point cloud model of {}.",
+     "There is a {} in the scene.",
+     "There is the {} in the scene.",
+     "a photo of a {} in the scene.",
+     "a photo of the {} in the scene.",
+     "a photo of one {} in the scene.",
+     "itap of a {}.",
+     "itap of my {}.",
+     "itap of the {}.",
+     "a photo of a {}.",
+     "a photo of my {}.",
+     "a photo of the {}.",
+     "a photo of one {}.",
+     "a photo of many {}.",
+     "a good photo of a {}.",
+     "a good photo of the {}.",
+     "a bad photo of a {}.",
+     "a bad photo of the {}.",
+     "a photo of a nice {}.",
+     "a photo of the nice {}.",
+     "a photo of a cool {}.",
+     "a photo of the cool {}.",
+     "a photo of a weird {}.",
+     "a photo of the weird {}.",
+     "a photo of a small {}.",
+     "a photo of the small {}.",
+     "a photo of a large {}.",
+     "a photo of the large {}.",
+     "a photo of a clean {}.",
+     "a photo of the clean {}.",
+     "a photo of a dirty {}.",
+     "a photo of the dirty {}.",
+     "a bright photo of a {}.",
+     "a bright photo of the {}.",
+     "a dark photo of a {}.",
+     "a dark photo of the {}.",
+     "a photo of a hard to see {}.",
+     "a photo of the hard to see {}.",
+     "a low resolution photo of a {}.",
+     "a low resolution photo of the {}.",
+     "a cropped photo of a {}.",
+     "a cropped photo of the {}.",
+     "a close-up photo of a {}.",
+     "a close-up photo of the {}.",
+     "a jpeg corrupted photo of a {}.",
+     "a jpeg corrupted photo of the {}.",
+     "a blurry photo of a {}.",
+     "a blurry photo of the {}.",
+     "a pixelated photo of a {}.",
+     "a pixelated photo of the {}.",
+     "a black and white photo of the {}.",
+     "a black and white photo of a {}",
+     "a plastic {}.",
+     "the plastic {}.",
+     "a toy {}.",
+     "the toy {}.",
+     "a plushie {}.",
+     "the plushie {}.",
+     "a cartoon {}.",
+     "the cartoon {}.",
+     "an embroidered {}.",
+     "the embroidered {}.",
+     "a painting of the {}.",
+     "a painting of a {}."
+   ]
+
+ }
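Editor's note: these are CLIP-style prompt templates with a {} slot for the object category. A usage sketch; "chair" is just an example label, and the repo's tokenizer wiring is not shown in this commit:

import json

with open("michelangelo/data/templates.json") as f:
    templates = json.load(f)["shape"]

prompts = [t.format("chair") for t in templates]
print(prompts[0])  # "a point cloud model of chair."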
michelangelo/data/transforms.py ADDED
@@ -0,0 +1,407 @@
+ # -*- coding: utf-8 -*-
+ import os
+ import time
+ import numpy as np
+ import warnings
+ import random
+ from omegaconf.listconfig import ListConfig
+ from webdataset import pipelinefilter
+ import torch
+ import torchvision.transforms.functional as TVF
+ from torchvision.transforms import InterpolationMode
+ from torchvision.transforms.transforms import _interpolation_modes_from_int
+ from typing import Sequence
+
+ from michelangelo.utils import instantiate_from_config
+
+
+ def _uid_buffer_pick(buf_dict, rng):
+     uid_keys = list(buf_dict.keys())
+     selected_uid = rng.choice(uid_keys)
+     buf = buf_dict[selected_uid]
+
+     k = rng.randint(0, len(buf) - 1)
+     sample = buf[k]
+     buf[k] = buf[-1]
+     buf.pop()
+
+     if len(buf) == 0:
+         del buf_dict[selected_uid]
+
+     return sample
+
+
+ def _add_to_buf_dict(buf_dict, sample):
+     key = sample["__key__"]
+     uid, uid_sample_id = key.split("_")
+     if uid not in buf_dict:
+         buf_dict[uid] = []
+     buf_dict[uid].append(sample)
+
+     return buf_dict
+
+
+ def _uid_shuffle(data, bufsize=1000, initial=100, rng=None, handler=None):
+     """Shuffle the data in the stream.
+
+     This uses a buffer of size `bufsize`. Shuffling at
+     startup is less random; this is traded off against
+     yielding samples quickly.
+
+     data: iterator
+     bufsize: buffer size for shuffling
+     returns: iterator
+     rng: either random module or random.Random instance
+
+     """
+     if rng is None:
+         rng = random.Random(int((os.getpid() + time.time()) * 1e9))
+     initial = min(initial, bufsize)
+     buf_dict = dict()
+     current_samples = 0
+     for sample in data:
+         _add_to_buf_dict(buf_dict, sample)
+         current_samples += 1
+
+         if current_samples < bufsize:
+             try:
+                 _add_to_buf_dict(buf_dict, next(data))  # skipcq: PYL-R1708
+                 current_samples += 1
+             except StopIteration:
+                 pass
+
+         if current_samples >= initial:
+             current_samples -= 1
+             yield _uid_buffer_pick(buf_dict, rng)
+
+     while current_samples > 0:
+         current_samples -= 1
+         yield _uid_buffer_pick(buf_dict, rng)
+
+
+ uid_shuffle = pipelinefilter(_uid_shuffle)
+
+
+ class RandomSample(object):
+     def __init__(self,
+                  num_volume_samples: int = 1024,
+                  num_near_samples: int = 1024):
+
+         super().__init__()
+
+         self.num_volume_samples = num_volume_samples
+         self.num_near_samples = num_near_samples
+
+     def __call__(self, sample):
+         rng = np.random.default_rng()
+
+         # 1. sample surface input
+         total_surface = sample["surface"]
+         ind = rng.choice(total_surface.shape[0], replace=False)
+         surface = total_surface[ind]
+
+         # 2. sample volume/near geometric points
+         vol_points = sample["vol_points"]
+         vol_label = sample["vol_label"]
+         near_points = sample["near_points"]
+         near_label = sample["near_label"]
+
+         ind = rng.choice(vol_points.shape[0], self.num_volume_samples, replace=False)
+         vol_points = vol_points[ind]
+         vol_label = vol_label[ind]
+         vol_points_labels = np.concatenate([vol_points, vol_label[:, np.newaxis]], axis=1)
+
+         ind = rng.choice(near_points.shape[0], self.num_near_samples, replace=False)
+         near_points = near_points[ind]
+         near_label = near_label[ind]
+         near_points_labels = np.concatenate([near_points, near_label[:, np.newaxis]], axis=1)
+
+         # concat sampled volume and near points
+         geo_points = np.concatenate([vol_points_labels, near_points_labels], axis=0)
+
+         sample = {
+             "surface": surface,
+             "geo_points": geo_points
+         }
+
+         return sample
+
+
+ class SplitRandomSample(object):
+     def __init__(self,
+                  use_surface_sample: bool = False,
+                  num_surface_samples: int = 4096,
+                  num_volume_samples: int = 1024,
+                  num_near_samples: int = 1024):
+
+         super().__init__()
+
+         self.use_surface_sample = use_surface_sample
+         self.num_surface_samples = num_surface_samples
+         self.num_volume_samples = num_volume_samples
+         self.num_near_samples = num_near_samples
+
+     def __call__(self, sample):
+
+         rng = np.random.default_rng()
+
+         # 1. sample surface input
+         surface = sample["surface"]
+
+         if self.use_surface_sample:
+             replace = surface.shape[0] < self.num_surface_samples
+             ind = rng.choice(surface.shape[0], self.num_surface_samples, replace=replace)
+             surface = surface[ind]
+
+         # 2. sample volume/near geometric points
+         vol_points = sample["vol_points"]
+         vol_label = sample["vol_label"]
+         near_points = sample["near_points"]
+         near_label = sample["near_label"]
+
+         ind = rng.choice(vol_points.shape[0], self.num_volume_samples, replace=False)
+         vol_points = vol_points[ind]
+         vol_label = vol_label[ind]
+         vol_points_labels = np.concatenate([vol_points, vol_label[:, np.newaxis]], axis=1)
+
+         ind = rng.choice(near_points.shape[0], self.num_near_samples, replace=False)
+         near_points = near_points[ind]
+         near_label = near_label[ind]
+         near_points_labels = np.concatenate([near_points, near_label[:, np.newaxis]], axis=1)
+
+         # concat sampled volume and near points
+         geo_points = np.concatenate([vol_points_labels, near_points_labels], axis=0)
+
+         sample = {
+             "surface": surface,
+             "geo_points": geo_points
+         }
+
+         return sample
+
+
+ class FeatureSelection(object):
+
+     VALID_SURFACE_FEATURE_DIMS = {
+         "none": [0, 1, 2],  # xyz
+         "watertight_normal": [0, 1, 2, 3, 4, 5],  # xyz, normal
+         "normal": [0, 1, 2, 6, 7, 8]
+     }
+
+     def __init__(self, surface_feature_type: str):
+
+         self.surface_feature_type = surface_feature_type
+         self.surface_dims = self.VALID_SURFACE_FEATURE_DIMS[surface_feature_type]
+
+     def __call__(self, sample):
+         sample["surface"] = sample["surface"][:, self.surface_dims]
+         return sample
+
+
+ class AxisScaleTransform(object):
+     def __init__(self, interval=(0.75, 1.25), jitter=True, jitter_scale=0.005):
+         assert isinstance(interval, (tuple, list, ListConfig))
+         self.interval = interval
+         self.min_val = interval[0]
+         self.max_val = interval[1]
+         self.inter_size = interval[1] - interval[0]
+         self.jitter = jitter
+         self.jitter_scale = jitter_scale
+
+     def __call__(self, sample):
+
+         surface = sample["surface"][..., 0:3]
+         geo_points = sample["geo_points"][..., 0:3]
+
+         scaling = torch.rand(1, 3) * self.inter_size + self.min_val
+         # print(scaling)
+         surface = surface * scaling
+         geo_points = geo_points * scaling
+
+         scale = (1 / torch.abs(surface).max().item()) * 0.999999
+         surface *= scale
+         geo_points *= scale
+
+         if self.jitter:
+             surface += self.jitter_scale * torch.randn_like(surface)
+             surface.clamp_(min=-1.015, max=1.015)
+
+         sample["surface"][..., 0:3] = surface
+         sample["geo_points"][..., 0:3] = geo_points
+
+         return sample
+
+
+ class ToTensor(object):
+
+     def __init__(self, tensor_keys=("surface", "geo_points", "tex_points")):
+         self.tensor_keys = tensor_keys
+
+     def __call__(self, sample):
+         for key in self.tensor_keys:
+             if key not in sample:
+                 continue
+
+             sample[key] = torch.tensor(sample[key], dtype=torch.float32)
+
+         return sample
+
+
+ class AxisScale(object):
+     def __init__(self, interval=(0.75, 1.25), jitter=True, jitter_scale=0.005):
+         assert isinstance(interval, (tuple, list, ListConfig))
+         self.interval = interval
+         self.jitter = jitter
+         self.jitter_scale = jitter_scale
+
+     def __call__(self, surface, *args):
+         scaling = torch.rand(1, 3) * 0.5 + 0.75
+         # print(scaling)
+         surface = surface * scaling
+         scale = (1 / torch.abs(surface).max().item()) * 0.999999
+         surface *= scale
+
+         args_outputs = []
+         for _arg in args:
+             _arg = _arg * scaling * scale
+             args_outputs.append(_arg)
+
+         if self.jitter:
+             surface += self.jitter_scale * torch.randn_like(surface)
+             surface.clamp_(min=-1, max=1)
+
+         if len(args) == 0:
+             return surface
+         else:
+             return surface, *args_outputs
+
+
+ class RandomResize(torch.nn.Module):
+     """Apply randomly Resize with a given probability."""
+
+     def __init__(
+             self,
+             size,
+             resize_radio=(0.5, 1),
+             allow_resize_interpolations=(InterpolationMode.BICUBIC, InterpolationMode.BILINEAR, InterpolationMode.BILINEAR),
+             interpolation=InterpolationMode.BICUBIC,
+             max_size=None,
+             antialias=None,
+     ):
+         super().__init__()
+         if not isinstance(size, (int, Sequence)):
+             raise TypeError(f"Size should be int or sequence. Got {type(size)}")
+         if isinstance(size, Sequence) and len(size) not in (1, 2):
+             raise ValueError("If size is a sequence, it should have 1 or 2 values")
+
+         self.size = size
+         self.max_size = max_size
+         # Backward compatibility with integer value
+         if isinstance(interpolation, int):
+             warnings.warn(
+                 "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. "
+                 "Please use InterpolationMode enum."
+             )
+             interpolation = _interpolation_modes_from_int(interpolation)
+
+         self.interpolation = interpolation
+         self.antialias = antialias
+
+         self.resize_radio = resize_radio
+         self.allow_resize_interpolations = allow_resize_interpolations
+
+     def random_resize_params(self):
+         radio = torch.rand(1) * (self.resize_radio[1] - self.resize_radio[0]) + self.resize_radio[0]
+
+         if isinstance(self.size, int):
+             size = int(self.size * radio)
+         elif isinstance(self.size, Sequence):
+             size = list(self.size)
+             size = (int(size[0] * radio), int(size[1] * radio))
+         else:
+             raise RuntimeError()
+
+         interpolation = self.allow_resize_interpolations[
+             torch.randint(low=0, high=len(self.allow_resize_interpolations), size=(1,))
+         ]
+         return size, interpolation
+
+     def forward(self, img):
+         size, interpolation = self.random_resize_params()
+         img = TVF.resize(img, size, interpolation, self.max_size, self.antialias)
+         img = TVF.resize(img, self.size, self.interpolation, self.max_size, self.antialias)
+         return img
+
+     def __repr__(self) -> str:
+         detail = f"(size={self.size}, interpolation={self.interpolation.value},"
+         detail += f"max_size={self.max_size}, antialias={self.antialias}), resize_radio={self.resize_radio}"
+         return f"{self.__class__.__name__}{detail}"
+
+
+ class Compose(object):
+     """Composes several transforms together. This transform does not support torchscript.
+     Please, see the note below.
+
+     Args:
+         transforms (list of ``Transform`` objects): list of transforms to compose.
+
+     Example:
+         >>> transforms.Compose([
+         >>>     transforms.CenterCrop(10),
+         >>>     transforms.ToTensor(),
+         >>> ])
+
+     .. note::
+         In order to script the transformations, please use ``torch.nn.Sequential`` as below.
+
+         >>> transforms = torch.nn.Sequential(
+         >>>     transforms.CenterCrop(10),
+         >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+         >>> )
+         >>> scripted_transforms = torch.jit.script(transforms)
+
+     Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
+     `lambda` functions or ``PIL.Image``.
+
+     """
+
+     def __init__(self, transforms):
+         self.transforms = transforms
+
+     def __call__(self, *args):
+         for t in self.transforms:
+             args = t(*args)
+         return args
+
+     def __repr__(self):
+         format_string = self.__class__.__name__ + '('
+         for t in self.transforms:
+             format_string += '\n'
+             format_string += '    {0}'.format(t)
+         format_string += '\n)'
+         return format_string
+
+
+ def identity(*args, **kwargs):
+     if len(args) == 1:
+         return args[0]
+     else:
+         return args
+
+
+ def build_transforms(cfg):
+
+     if cfg is None:
+         return identity
+
+     transforms = []
+
+     for transform_name, cfg_instance in cfg.items():
+         transform_instance = instantiate_from_config(cfg_instance)
+         transforms.append(transform_instance)
+         print(f"Build transform: {transform_instance}")
+
+     transforms = Compose(transforms)
+
+     return transforms
+
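Editor's note: a usage sketch for the sampling transforms above on a synthetic sample; the array shapes are illustrative, not the dataset's actual sizes. Note that the Compose defined here chains positional arguments (t(*args)), which suits tensor-in/tensor-out transforms such as AxisScale; the dict-based transforms are simply applied in sequence.

import numpy as np

sample = {
    "surface": np.random.rand(4096, 6).astype(np.float32),  # xyz + normal
    "vol_points": np.random.rand(2048, 3).astype(np.float32),
    "vol_label": np.random.randint(0, 2, 2048).astype(np.float32),
    "near_points": np.random.rand(2048, 3).astype(np.float32),
    "near_label": np.random.randint(0, 2, 2048).astype(np.float32),
}

sampler = SplitRandomSample(use_surface_sample=True, num_surface_samples=1024,
                            num_volume_samples=512, num_near_samples=512)
sample = ToTensor()(sampler(sample))
print(sample["surface"].shape, sample["geo_points"].shape)
# torch.Size([1024, 6]) torch.Size([1024, 4])  (geo_points = xyz + occupancy label)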
michelangelo/data/utils.py ADDED
@@ -0,0 +1,59 @@
+ # -*- coding: utf-8 -*-
+
+ import torch
+ import numpy as np
+
+
+ def worker_init_fn(_):
+     worker_info = torch.utils.data.get_worker_info()
+     worker_id = worker_info.id
+
+     # dataset = worker_info.dataset
+     # split_size = dataset.num_records // worker_info.num_workers
+     # # reset num_records to the true number to retain reliable length information
+     # dataset.sample_ids = dataset.valid_ids[worker_id * split_size:(worker_id + 1) * split_size]
+     # current_id = np.random.choice(len(np.random.get_state()[1]), 1)
+     # return np.random.seed(np.random.get_state()[1][current_id] + worker_id)
+
+     return np.random.seed(np.random.get_state()[1][0] + worker_id)
+
+
+ def collation_fn(samples, combine_tensors=True, combine_scalars=True):
+     """
+
+     Args:
+         samples (list[dict]):
+         combine_tensors:
+         combine_scalars:
+
+     Returns:
+
+     """
+
+     result = {}
+
+     keys = samples[0].keys()
+
+     for key in keys:
+         result[key] = []
+
+     for sample in samples:
+         for key in keys:
+             val = sample[key]
+             result[key].append(val)
+
+     for key in keys:
+         val_list = result[key]
+         if isinstance(val_list[0], (int, float)):
+             if combine_scalars:
+                 result[key] = np.array(result[key])
+
+         elif isinstance(val_list[0], torch.Tensor):
+             if combine_tensors:
+                 result[key] = torch.stack(val_list)
+
+         elif isinstance(val_list[0], np.ndarray):
+             if combine_tensors:
+                 result[key] = np.stack(val_list)
+
+     return result
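Editor's note: a wiring sketch for these helpers; `dataset` is a placeholder for any dataset yielding dicts of same-shaped arrays or tensors, and the batch size matches the configs above.

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=38, num_workers=4,
                    collate_fn=collation_fn, worker_init_fn=worker_init_fn)
batch = next(iter(loader))  # dict of stacked tensors/arrays keyed like the samples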
michelangelo/graphics/__init__.py ADDED
@@ -0,0 +1 @@
+ # -*- coding: utf-8 -*-
michelangelo/graphics/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (185 Bytes).
michelangelo/graphics/primitives/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # -*- coding: utf-8 -*-
+
+ from .volume import generate_dense_grid_points
+
+ from .mesh import (
+     MeshOutput,
+     save_obj,
+     savemeshtes2
+ )
michelangelo/graphics/primitives/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (334 Bytes).
michelangelo/graphics/primitives/__pycache__/extract_texture_map.cpython-39.pyc ADDED
Binary file (2.46 kB).
michelangelo/graphics/primitives/__pycache__/mesh.cpython-39.pyc ADDED
Binary file (2.93 kB).
michelangelo/graphics/primitives/__pycache__/volume.cpython-39.pyc ADDED
Binary file (860 Bytes).
michelangelo/graphics/primitives/mesh.py ADDED
@@ -0,0 +1,114 @@
+ # -*- coding: utf-8 -*-
+
+ import os
+ import cv2
+ import numpy as np
+ import PIL.Image
+ from typing import Optional
+
+ import trimesh
+
+
+ def save_obj(pointnp_px3, facenp_fx3, fname):
+     fid = open(fname, "w")
+     write_str = ""
+     for pidx, p in enumerate(pointnp_px3):
+         pp = p
+         write_str += "v %f %f %f\n" % (pp[0], pp[1], pp[2])
+
+     for i, f in enumerate(facenp_fx3):
+         f1 = f + 1
+         write_str += "f %d %d %d\n" % (f1[0], f1[1], f1[2])
+     fid.write(write_str)
+     fid.close()
+     return
+
+
+ def savemeshtes2(pointnp_px3, tcoords_px2, facenp_fx3, facetex_fx3, tex_map, fname):
+     fol, na = os.path.split(fname)
+     na, _ = os.path.splitext(na)
+
+     matname = "%s/%s.mtl" % (fol, na)
+     fid = open(matname, "w")
+     fid.write("newmtl material_0\n")
+     fid.write("Kd 1 1 1\n")
+     fid.write("Ka 0 0 0\n")
+     fid.write("Ks 0.4 0.4 0.4\n")
+     fid.write("Ns 10\n")
+     fid.write("illum 2\n")
+     fid.write("map_Kd %s.png\n" % na)
+     fid.close()
+     ####
+
+     fid = open(fname, "w")
+     fid.write("mtllib %s.mtl\n" % na)
+
+     for pidx, p in enumerate(pointnp_px3):
+         pp = p
+         fid.write("v %f %f %f\n" % (pp[0], pp[1], pp[2]))
+
+     for pidx, p in enumerate(tcoords_px2):
+         pp = p
+         fid.write("vt %f %f\n" % (pp[0], pp[1]))
+
+     fid.write("usemtl material_0\n")
+     for i, f in enumerate(facenp_fx3):
+         f1 = f + 1
+         f2 = facetex_fx3[i] + 1
+         fid.write("f %d/%d %d/%d %d/%d\n" % (f1[0], f2[0], f1[1], f2[1], f1[2], f2[2]))
+     fid.close()
+
+     PIL.Image.fromarray(np.ascontiguousarray(tex_map), "RGB").save(
+         os.path.join(fol, "%s.png" % na))
+
+     return
+
+
+ class MeshOutput(object):
+
+     def __init__(self,
+                  mesh_v: np.ndarray,
+                  mesh_f: np.ndarray,
+                  vertex_colors: Optional[np.ndarray] = None,
+                  uvs: Optional[np.ndarray] = None,
+                  mesh_tex_idx: Optional[np.ndarray] = None,
+                  tex_map: Optional[np.ndarray] = None):
+
+         self.mesh_v = mesh_v
+         self.mesh_f = mesh_f
+         self.vertex_colors = vertex_colors
+         self.uvs = uvs
+         self.mesh_tex_idx = mesh_tex_idx
+         self.tex_map = tex_map
+
+     def contain_uv_texture(self):
+         return (self.uvs is not None) and (self.mesh_tex_idx is not None) and (self.tex_map is not None)
+
+     def contain_vertex_colors(self):
+         return self.vertex_colors is not None
+
+     def export(self, fname):
+
+         if self.contain_uv_texture():
+             savemeshtes2(
+                 self.mesh_v,
+                 self.uvs,
+                 self.mesh_f,
+                 self.mesh_tex_idx,
+                 self.tex_map,
+                 fname
+             )
+
+         elif self.contain_vertex_colors():
+             mesh_obj = trimesh.Trimesh(vertices=self.mesh_v, faces=self.mesh_f, vertex_colors=self.vertex_colors)
+             mesh_obj.export(fname)
+
+         else:
+             save_obj(
+                 self.mesh_v,
+                 self.mesh_f,
+                 fname
+             )
+
+
+
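Editor's note: a usage sketch exporting a unit tetrahedron through MeshOutput. With neither vertex colors nor a texture set, export() falls through to the plain save_obj path; the filename is arbitrary.

import numpy as np

verts = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
faces = np.array([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]], dtype=np.int64)
MeshOutput(mesh_v=verts, mesh_f=faces).export("tetrahedron.obj")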
michelangelo/graphics/primitives/volume.py ADDED
@@ -0,0 +1,21 @@
+ # -*- coding: utf-8 -*-
+
+ import numpy as np
+
+
+ def generate_dense_grid_points(bbox_min: np.ndarray,
+                                bbox_max: np.ndarray,
+                                octree_depth: int,
+                                indexing: str = "ij"):
+     length = bbox_max - bbox_min
+     num_cells = np.exp2(octree_depth)
+     x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
+     y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
+     z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
+     [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
+     xyz = np.stack((xs, ys, zs), axis=-1)
+     xyz = xyz.reshape(-1, 3)
+     grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
+
+     return xyz, grid_size, length
+
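Editor's note: a usage sketch matching the logger settings in the configs above (bounds of +/-1.1, octree_depth 7): 2**7 = 128 cells per axis yields a 129^3 grid of query points for mesh extraction.

import numpy as np

bbox_min = np.array([-1.1, -1.1, -1.1])
bbox_max = np.array([1.1, 1.1, 1.1])
xyz, grid_size, length = generate_dense_grid_points(bbox_min, bbox_max, octree_depth=7)
print(xyz.shape, grid_size)  # (2146689, 3) [129, 129, 129]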
michelangelo/models/__init__.py ADDED
@@ -0,0 +1 @@
+ # -*- coding: utf-8 -*-
michelangelo/models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (183 Bytes).
michelangelo/models/asl_diffusion/__init__.py ADDED
@@ -0,0 +1 @@
+ # -*- coding: utf-8 -*-
michelangelo/models/asl_diffusion/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (197 Bytes).
michelangelo/models/asl_diffusion/__pycache__/asl_udt.cpython-39.pyc ADDED
Binary file (2.64 kB).
michelangelo/models/asl_diffusion/__pycache__/clip_asl_diffuser_pl_module.cpython-39.pyc ADDED
Binary file (9.87 kB).
michelangelo/models/asl_diffusion/__pycache__/inference_utils.cpython-39.pyc ADDED
Binary file (1.75 kB).
michelangelo/models/asl_diffusion/asl_diffuser_pl_module.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from omegaconf import DictConfig
4
+ from typing import List, Tuple, Dict, Optional, Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from torch.optim import lr_scheduler
10
+ import pytorch_lightning as pl
11
+ from pytorch_lightning.utilities import rank_zero_only
12
+
13
+ from einops import rearrange
14
+
15
+ from diffusers.schedulers import (
16
+ DDPMScheduler,
17
+ DDIMScheduler,
18
+ KarrasVeScheduler,
19
+ DPMSolverMultistepScheduler
20
+ )
21
+
22
+ from michelangelo.utils import instantiate_from_config
23
+ # from michelangelo.models.tsal.tsal_base import ShapeAsLatentPLModule
24
+ from michelangelo.models.tsal.tsal_base import AlignedShapeAsLatentPLModule
25
+ from michelangelo.models.asl_diffusion.inference_utils import ddim_sample
26
+
27
+ SchedulerType = Union[DDIMScheduler, KarrasVeScheduler, DPMSolverMultistepScheduler]
28
+
29
+
30
+ def disabled_train(self, mode=True):
31
+ """Overwrite model.train with this function to make sure train/eval mode
32
+ does not change anymore."""
33
+ return self
34
+
35
+
36
+ class ASLDiffuser(pl.LightningModule):
37
+ first_stage_model: Optional[AlignedShapeAsLatentPLModule]
38
+ # cond_stage_model: Optional[Union[nn.Module, pl.LightningModule]]
39
+ model: nn.Module
40
+
41
+ def __init__(self, *,
42
+ first_stage_config,
43
+ denoiser_cfg,
44
+ scheduler_cfg,
45
+ optimizer_cfg,
46
+ loss_cfg,
47
+ first_stage_key: str = "surface",
48
+ cond_stage_key: str = "image",
49
+ cond_stage_trainable: bool = True,
50
+ scale_by_std: bool = False,
51
+ z_scale_factor: float = 1.0,
52
+ ckpt_path: Optional[str] = None,
53
+ ignore_keys: Union[Tuple[str], List[str]] = ()):
54
+
55
+ super().__init__()
56
+
57
+ self.first_stage_key = first_stage_key
58
+ self.cond_stage_key = cond_stage_key
59
+ self.cond_stage_trainable = cond_stage_trainable
60
+
61
+ # 1. initialize first stage.
62
+ # Note: the condition model contained in the first stage model.
63
+ self.first_stage_config = first_stage_config
64
+ self.first_stage_model = None
65
+ # self.instantiate_first_stage(first_stage_config)
66
+
67
+ # 2. initialize conditional stage
68
+ # self.instantiate_cond_stage(cond_stage_config)
69
+ self.cond_stage_model = {
70
+ "image": self.encode_image,
71
+ "image_unconditional_embedding": self.empty_img_cond,
72
+ "text": self.encode_text,
73
+ "text_unconditional_embedding": self.empty_text_cond,
74
+ "surface": self.encode_surface,
75
+ "surface_unconditional_embedding": self.empty_surface_cond,
76
+ }
77
+
78
+ # 3. diffusion model
79
+ self.model = instantiate_from_config(
80
+ denoiser_cfg, device=None, dtype=None
81
+ )
82
+
83
+ self.optimizer_cfg = optimizer_cfg
84
+
85
+ # 4. scheduling strategy
86
+ self.scheduler_cfg = scheduler_cfg
87
+
88
+ self.noise_scheduler: DDPMScheduler = instantiate_from_config(scheduler_cfg.noise)
89
+ self.denoise_scheduler: SchedulerType = instantiate_from_config(scheduler_cfg.denoise)
90
+
91
+ # 5. loss configures
92
+ self.loss_cfg = loss_cfg
93
+
94
+ self.scale_by_std = scale_by_std
95
+ if scale_by_std:
96
+ self.register_buffer("z_scale_factor", torch.tensor(z_scale_factor))
97
+ else:
98
+ self.z_scale_factor = z_scale_factor
99
+
100
+ self.ckpt_path = ckpt_path
101
+ if ckpt_path is not None:
102
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
103
+
104
+ def instantiate_first_stage(self, config):
105
+ model = instantiate_from_config(config)
106
+ self.first_stage_model = model.eval()
107
+ self.first_stage_model.train = disabled_train
108
+ for param in self.first_stage_model.parameters():
109
+ param.requires_grad = False
110
+
111
+ self.first_stage_model = self.first_stage_model.to(self.device)
112
+
113
+ # def instantiate_cond_stage(self, config):
114
+ # if not self.cond_stage_trainable:
115
+ # if config == "__is_first_stage__":
116
+ # print("Using first stage also as cond stage.")
117
+ # self.cond_stage_model = self.first_stage_model
118
+ # elif config == "__is_unconditional__":
119
+ # print(f"Training {self.__class__.__name__} as an unconditional model.")
120
+ # self.cond_stage_model = None
121
+ # # self.be_unconditional = True
122
+ # else:
123
+ # model = instantiate_from_config(config)
124
+ # self.cond_stage_model = model.eval()
125
+ # self.cond_stage_model.train = disabled_train
126
+ # for param in self.cond_stage_model.parameters():
127
+ # param.requires_grad = False
128
+ # else:
129
+ # assert config != "__is_first_stage__"
130
+ # assert config != "__is_unconditional__"
131
+ # model = instantiate_from_config(config)
132
+ # self.cond_stage_model = model
133
+
134
+ def init_from_ckpt(self, path, ignore_keys=()):
135
+ state_dict = torch.load(path, map_location="cpu")["state_dict"]
136
+
137
+ keys = list(state_dict.keys())
138
+ for k in keys:
139
+ for ik in ignore_keys:
140
+ if k.startswith(ik):
141
+ print("Deleting key {} from state_dict.".format(k))
142
+ del state_dict[k]
143
+
144
+ missing, unexpected = self.load_state_dict(state_dict, strict=False)
145
+ print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
146
+ if len(missing) > 0:
147
+ print(f"Missing Keys: {missing}")
148
+ print(f"Unexpected Keys: {unexpected}")
149
+
150
+ @property
151
+ def zero_rank(self):
152
+ if self._trainer:
153
+ zero_rank = self.trainer.local_rank == 0
154
+ else:
155
+ zero_rank = True
156
+
157
+ return zero_rank
158
+
159
+ def configure_optimizers(self) -> Tuple[List, List]:
160
+
161
+ lr = self.learning_rate
162
+
163
+ trainable_parameters = list(self.model.parameters())
164
+ # if the conditional encoder is trainable
165
+
166
+ # if self.cond_stage_trainable:
167
+ # conditioner_params = [p for p in self.cond_stage_model.parameters() if p.requires_grad]
168
+ # trainable_parameters += conditioner_params
169
+ # print(f"number of trainable conditional parameters: {len(conditioner_params)}.")
170
+
171
+ if self.optimizer_cfg is None:
172
+ optimizers = [torch.optim.AdamW(trainable_parameters, lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)]
173
+ schedulers = []
174
+ else:
175
+ optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=trainable_parameters)
176
+ scheduler_func = instantiate_from_config(
177
+ self.optimizer_cfg.scheduler,
178
+ max_decay_steps=self.trainer.max_steps,
179
+ lr_max=lr
180
+ )
181
+ scheduler = {
182
+ "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
183
+ "interval": "step",
184
+ "frequency": 1
185
+ }
186
+ optimizers = [optimizer]
187
+ schedulers = [scheduler]
188
+
189
+ return optimizers, schedulers
190
+
191
+ @torch.no_grad()
192
+ def encode_text(self, text):
193
+
194
+ b = text.shape[0]
195
+ text_tokens = rearrange(text, "b t l -> (b t) l")
196
+ text_embed = self.first_stage_model.model.encode_text_embed(text_tokens)
197
+ text_embed = rearrange(text_embed, "(b t) d -> b t d", b=b)
198
+ text_embed = text_embed.mean(dim=1)
199
+ text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)
200
+
201
+ return text_embed
202
+
203
+ @torch.no_grad()
204
+ def encode_image(self, img):
205
+
206
+ return self.first_stage_model.model.encode_image_embed(img)
207
+
208
+ @torch.no_grad()
209
+ def encode_surface(self, surface):
210
+
211
+ return self.first_stage_model.model.encode_shape_embed(surface, return_latents=False)
212
+
213
+ @torch.no_grad()
214
+ def empty_text_cond(self, cond):
215
+
216
+ return torch.zeros_like(cond, device=cond.device)
217
+
218
+ @torch.no_grad()
219
+ def empty_img_cond(self, cond):
220
+
221
+ return torch.zeros_like(cond, device=cond.device)
222
+
223
+ @torch.no_grad()
224
+ def empty_surface_cond(self, cond):
225
+
226
+ return torch.zeros_like(cond, device=cond.device)
227
+
228
+ @torch.no_grad()
229
+ def encode_first_stage(self, surface: torch.FloatTensor, sample_posterior=True):
230
+
231
+ z_q = self.first_stage_model.encode(surface, sample_posterior)
232
+ z_q = self.z_scale_factor * z_q
233
+
234
+ return z_q
235
+
236
+ @torch.no_grad()
237
+ def decode_first_stage(self, z_q: torch.FloatTensor, **kwargs):
238
+
239
+ z_q = 1. / self.z_scale_factor * z_q
240
+ latents = self.first_stage_model.decode(z_q, **kwargs)
241
+ return latents
242
+
243
+ @rank_zero_only
244
+ @torch.no_grad()
245
+ def on_train_batch_start(self, batch, batch_idx):
246
+ # only for very first batch
247
+ if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 \
248
+ and batch_idx == 0 and self.ckpt_path is None:
249
+ # set rescale weight to 1./std of encodings
250
+ print("### USING STD-RESCALING ###")
251
+
252
+ z_q = self.encode_first_stage(batch[self.first_stage_key])
253
+ z = z_q.detach()
254
+
255
+ del self.z_scale_factor
256
+ self.register_buffer("z_scale_factor", 1. / z.flatten().std())
257
+ print(f"setting self.z_scale_factor to {self.z_scale_factor}")
258
+
259
+ print("### USING STD-RESCALING ###")
260
+
261
+ def compute_loss(self, model_outputs, split):
262
+ """
263
+
264
+ Args:
265
+ model_outputs (dict):
266
+ - x_0:
267
+ - noise:
268
+ - noise_prior:
269
+ - noise_pred:
270
+ - noise_pred_prior:
271
+
272
+ split (str):
273
+
274
+ Returns:
275
+
276
+ """
277
+
278
+ pred = model_outputs["pred"]
279
+
280
+ if self.noise_scheduler.prediction_type == "epsilon":
281
+ target = model_outputs["noise"]
282
+ elif self.noise_scheduler.prediction_type == "sample":
283
+ target = model_outputs["x_0"]
284
+ else:
285
+ raise NotImplementedError(f"Prediction Type: {self.noise_scheduler.prediction_type} not yet supported.")
286
+
287
+ if self.loss_cfg.loss_type == "l1":
288
+ simple = F.l1_loss(pred, target, reduction="mean")
289
+ elif self.loss_cfg.loss_type in ["mse", "l2"]:
290
+ simple = F.mse_loss(pred, target, reduction="mean")
291
+ else:
292
+ raise NotImplementedError(f"Loss Type: {self.loss_cfg.loss_type} not yet supported.")
293
+
294
+ total_loss = simple
295
+
296
+ loss_dict = {
297
+ f"{split}/total_loss": total_loss.clone().detach(),
298
+ f"{split}/simple": simple.detach(),
299
+ }
300
+
301
+ return total_loss, loss_dict
302
+
303
+ def forward(self, batch):
304
+ """
305
+
306
+ Args:
307
+ batch:
308
+
309
+ Returns:
310
+
311
+ """
312
+
313
+ if self.first_stage_model is None:
314
+ self.instantiate_first_stage(self.first_stage_config)
315
+
316
+ latents = self.encode_first_stage(batch[self.first_stage_key])
317
+
318
+ # conditions = self.cond_stage_model.encode(batch[self.cond_stage_key])
319
+
320
+ conditions = self.cond_stage_model[self.cond_stage_key](batch[self.cond_stage_key]).unsqueeze(1)
321
+
322
+ mask = torch.rand((len(conditions), 1, 1), device=conditions.device, dtype=conditions.dtype) >= 0.1
323
+ conditions = conditions * mask.to(conditions)
324
+
325
+ # Sample noise that we"ll add to the latents
326
+ # [batch_size, n_token, latent_dim]
327
+ noise = torch.randn_like(latents)
328
+ bs = latents.shape[0]
329
+ # Sample a random timestep for each motion
330
+ timesteps = torch.randint(
331
+ 0,
332
+ self.noise_scheduler.config.num_train_timesteps,
333
+ (bs,),
334
+ device=latents.device,
335
+ )
336
+ timesteps = timesteps.long()
337
+ # Add noise to the latents according to the noise magnitude at each timestep
338
+ noisy_z = self.noise_scheduler.add_noise(latents, noise, timesteps)
339
+
340
+ # diffusion model forward
341
+ noise_pred = self.model(noisy_z, timesteps, conditions)
342
+
+         diffusion_outputs = {
+             "x_0": latents,  # clean latents: the loss target when prediction_type == "sample" (was noisy_z)
+             "noise": noise,
+             "pred": noise_pred
+         }
+
+         return diffusion_outputs
+
+     def training_step(self, batch: Dict[str, Union[torch.FloatTensor, List[str]]],
+                       batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
+         """
+         Args:
+             batch (dict): the batch sample, and it contains:
+                 - surface (torch.FloatTensor):
+                 - image (torch.FloatTensor): if provided, [bs, 3, h, w], items in range [0, 1]
+                 - depth (torch.FloatTensor): if provided, [bs, 1, h, w], items in range [-1, 1]
+                 - normal (torch.FloatTensor): if provided, [bs, 3, h, w], items in range [-1, 1]
+                 - text (list of str):
+
+             batch_idx (int):
+             optimizer_idx (int):
+
+         Returns:
+             loss (torch.FloatTensor):
+         """
+
+         diffusion_outputs = self(batch)
+
+         loss, loss_dict = self.compute_loss(diffusion_outputs, "train")
+         self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
+
+         return loss
+
+     def validation_step(self, batch: Dict[str, torch.FloatTensor],
+                         batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
+         """
+         Args:
+             batch (dict): the batch sample, and it contains:
+                 - surface_pc (torch.FloatTensor): [n_pts, 4]
+                 - surface_feats (torch.FloatTensor): [n_pts, c]
+                 - text (list of str):
+
+             batch_idx (int):
+             optimizer_idx (int):
+
+         Returns:
+             loss (torch.FloatTensor):
+         """
+
+         diffusion_outputs = self(batch)
+
+         loss, loss_dict = self.compute_loss(diffusion_outputs, "val")
+         self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
+
+         return loss
+
+     @torch.no_grad()
+     def sample(self,
+                batch: Dict[str, Union[torch.FloatTensor, List[str]]],
+                sample_times: int = 1,
+                steps: Optional[int] = None,
+                guidance_scale: Optional[float] = None,
+                eta: float = 0.0,
+                return_intermediates: bool = False, **kwargs):
+
+         if self.first_stage_model is None:
+             self.instantiate_first_stage(self.first_stage_config)
+
+         if steps is None:
+             steps = self.scheduler_cfg.num_inference_steps
+
+         if guidance_scale is None:
+             guidance_scale = self.scheduler_cfg.guidance_scale
+         do_classifier_free_guidance = guidance_scale > 0
+
+         # conditional encode
+         xc = batch[self.cond_stage_key]
+         # cond = self.cond_stage_model[self.cond_stage_key](xc)
+         cond = self.cond_stage_model[self.cond_stage_key](xc).unsqueeze(1)
+
+         if do_classifier_free_guidance:
+             # Note: there are two kinds of unconditional embedding for text:
+             #   1. "" as the uncond text (as in SAL diffusion);
+             #   2. zeros_like(cond) as the uncond text (as in MDM).
+             # un_cond = self.cond_stage_model.unconditional_embedding(batch_size=len(xc))
+             un_cond = self.cond_stage_model[f"{self.cond_stage_key}_unconditional_embedding"](cond)
+             # un_cond = torch.zeros_like(cond, device=cond.device)
+             cond = torch.cat([un_cond, cond], dim=0)
+
+         outputs = []
+         latents = None
+
+         if not return_intermediates:
+             for _ in range(sample_times):
+                 sample_loop = ddim_sample(
+                     self.denoise_scheduler,
+                     self.model,
+                     shape=self.first_stage_model.latent_shape,
+                     cond=cond,
+                     steps=steps,
+                     guidance_scale=guidance_scale,
+                     do_classifier_free_guidance=do_classifier_free_guidance,
+                     device=self.device,
+                     eta=eta,
+                     disable_prog=not self.zero_rank
+                 )
+                 for sample, t in sample_loop:
+                     latents = sample
+                 outputs.append(self.decode_first_stage(latents, **kwargs))
+         else:
+             sample_loop = ddim_sample(
+                 self.denoise_scheduler,
+                 self.model,
+                 shape=self.first_stage_model.latent_shape,
+                 cond=cond,
+                 steps=steps,
+                 guidance_scale=guidance_scale,
+                 do_classifier_free_guidance=do_classifier_free_guidance,
+                 device=self.device,
+                 eta=eta,
+                 disable_prog=not self.zero_rank
+             )
+
+             iter_size = steps // sample_times
+             i = 0
+             for sample, t in sample_loop:
+                 latents = sample
+                 if i % iter_size == 0 or i == steps - 1:
+                     outputs.append(self.decode_first_stage(latents, **kwargs))
+                 i += 1
+
+         return outputs
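
A minimal usage sketch for the module above: ASLDiffuser is driven through its sample() entry point at inference time. The sketch assumes the image-conditioned YAML shipped in this commit exposes the Lightning module under a `model:` key, that the conditioning batch key is "image", and that a CUDA device is available; all three are assumptions, not facts from the diff.

# Minimal sketch: drive ASLDiffuser.sample() (config key `model:` and the
# "image" batch key are assumptions; adjust to the actual YAML layout).
import torch
from omegaconf import OmegaConf
from michelangelo.utils import instantiate_from_config

cfg = OmegaConf.load("configs/image_cond_diffuser_asl/image-ASLDM-256.yaml")
module = instantiate_from_config(cfg.model).eval().cuda()

batch = {"image": torch.rand(1, 3, 224, 224, device="cuda")}
with torch.no_grad():
    # returns one decoded first-stage output per sample_times
    outputs = module.sample(batch, sample_times=1, steps=50, guidance_scale=7.5)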
michelangelo/models/asl_diffusion/asl_udt.py ADDED
@@ -0,0 +1,104 @@
+ # -*- coding: utf-8 -*-
+
+ import torch
+ import torch.nn as nn
+ from typing import Optional
+ from diffusers.models.embeddings import Timesteps
+ import math
+
+ from michelangelo.models.modules.transformer_blocks import MLP
+ from michelangelo.models.modules.diffusion_transformer import UNetDiffusionTransformer
+
+
+ class ConditionalASLUDTDenoiser(nn.Module):
+
+     def __init__(self, *,
+                  device: Optional[torch.device],
+                  dtype: Optional[torch.dtype],
+                  input_channels: int,
+                  output_channels: int,
+                  n_ctx: int,
+                  width: int,
+                  layers: int,
+                  heads: int,
+                  context_dim: int,
+                  context_ln: bool = True,
+                  skip_ln: bool = False,
+                  init_scale: float = 0.25,
+                  flip_sin_to_cos: bool = False,
+                  use_checkpoint: bool = False):
+         super().__init__()
+
+         self.use_checkpoint = use_checkpoint
+
+         init_scale = init_scale * math.sqrt(1.0 / width)
+
+         self.backbone = UNetDiffusionTransformer(
+             device=device,
+             dtype=dtype,
+             n_ctx=n_ctx,
+             width=width,
+             layers=layers,
+             heads=heads,
+             skip_ln=skip_ln,
+             init_scale=init_scale,
+             use_checkpoint=use_checkpoint
+         )
+         self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
+         self.input_proj = nn.Linear(input_channels, width, device=device, dtype=dtype)
+         self.output_proj = nn.Linear(width, output_channels, device=device, dtype=dtype)
+
+         # timestep embedding
+         self.time_embed = Timesteps(width, flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=0)
+         self.time_proj = MLP(
+             device=device, dtype=dtype, width=width, init_scale=init_scale
+         )
+
+         # context (condition) projector, optionally preceded by a LayerNorm
+         if context_ln:
+             self.context_embed = nn.Sequential(
+                 nn.LayerNorm(context_dim, device=device, dtype=dtype),
+                 nn.Linear(context_dim, width, device=device, dtype=dtype),
+             )
+         else:
+             self.context_embed = nn.Linear(context_dim, width, device=device, dtype=dtype)
+
+     def forward(self,
+                 model_input: torch.FloatTensor,
+                 timestep: torch.LongTensor,
+                 context: torch.FloatTensor):
+         r"""
+         Args:
+             model_input (torch.FloatTensor): [bs, n_data, c]
+             timestep (torch.LongTensor): [bs,]
+             context (torch.FloatTensor): [bs, context_tokens, c]
+
+         Returns:
+             sample (torch.FloatTensor): [bs, n_data, c]
+         """
+
+         _, n_data, _ = model_input.shape
+
+         # 1. timestep token
+         t_emb = self.time_proj(self.time_embed(timestep)).unsqueeze(dim=1)
+
+         # 2. condition projector
+         context = self.context_embed(context)
+
+         # 3. denoiser: run the transformer over [time | context | latents]
+         x = self.input_proj(model_input)
+         x = torch.cat([t_emb, context, x], dim=1)
+         x = self.backbone(x)
+         x = self.ln_post(x)
+         x = x[:, -n_data:]
+         sample = self.output_proj(x)
+
+         return sample
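
The token layout of ConditionalASLUDTDenoiser.forward is the part worth internalizing: one timestep token and the projected context tokens are prepended to the projected latent tokens, the transformer runs over the combined sequence, and only the last n_data positions are projected back out. A shape walk-through under illustrative sizes (every number here is an assumption, not a config value):

# Shape walk-through for the forward pass above (illustrative sizes only).
import torch

bs, n_data, c_in, n_ctx_tok, width = 2, 256, 64, 1, 768

model_input = torch.randn(bs, n_data, c_in)   # noisy shape latents
timestep = torch.randint(0, 1000, (bs,))      # one diffusion step per sample
context = torch.randn(bs, n_ctx_tok, 1024)    # e.g. a CLIP embedding, unsqueezed to one token

# t_emb   = time_proj(time_embed(timestep)).unsqueeze(1)  -> [bs, 1, width]
# context = context_embed(context)                        -> [bs, n_ctx_tok, width]
# x       = input_proj(model_input)                       -> [bs, n_data, width]
# cat     = [t_emb | context | x]                         -> [bs, 1 + n_ctx_tok + n_data, width]
# backbone and ln_post keep the sequence length; x[:, -n_data:] drops the
# time/context tokens, and output_proj maps back          -> [bs, n_data, output_channels]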
michelangelo/models/asl_diffusion/base.py ADDED
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+
+ import torch
+ import torch.nn as nn
+
+
+ class BaseDenoiser(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x, t, context):
+         raise NotImplementedError
michelangelo/models/asl_diffusion/clip_asl_diffuser_pl_module.py ADDED
@@ -0,0 +1,393 @@
+ # -*- coding: utf-8 -*-
+
+ from omegaconf import DictConfig
+ from typing import List, Tuple, Dict, Optional, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.optim import lr_scheduler
+ import pytorch_lightning as pl
+ from pytorch_lightning.utilities import rank_zero_only
+
+ from diffusers.schedulers import (
+     DDPMScheduler,
+     DDIMScheduler,
+     KarrasVeScheduler,
+     DPMSolverMultistepScheduler
+ )
+
+ from michelangelo.utils import instantiate_from_config
+ from michelangelo.models.tsal.tsal_base import AlignedShapeAsLatentPLModule
+ from michelangelo.models.asl_diffusion.inference_utils import ddim_sample
+
+ SchedulerType = Union[DDIMScheduler, KarrasVeScheduler, DPMSolverMultistepScheduler]
+
+
+ def disabled_train(self, mode=True):
+     """Overwrite model.train with this function to make sure train/eval mode
+     does not change anymore."""
+     return self
+
+
+ class ClipASLDiffuser(pl.LightningModule):
+     first_stage_model: Optional[AlignedShapeAsLatentPLModule]
+     cond_stage_model: Optional[Union[nn.Module, pl.LightningModule]]
+     model: nn.Module
+
+     def __init__(self, *,
+                  first_stage_config,
+                  cond_stage_config,
+                  denoiser_cfg,
+                  scheduler_cfg,
+                  optimizer_cfg,
+                  loss_cfg,
+                  first_stage_key: str = "surface",
+                  cond_stage_key: str = "image",
+                  scale_by_std: bool = False,
+                  z_scale_factor: float = 1.0,
+                  ckpt_path: Optional[str] = None,
+                  ignore_keys: Union[Tuple[str], List[str]] = ()):
+
+         super().__init__()
+
+         self.first_stage_key = first_stage_key
+         self.cond_stage_key = cond_stage_key
+
+         # 1. lazily initialize the first stage
+         self.instantiate_first_stage(first_stage_config)
+
+         # 2. initialize the conditional stage
+         self.instantiate_cond_stage(cond_stage_config)
+
+         # 3. diffusion model
+         self.model = instantiate_from_config(
+             denoiser_cfg, device=None, dtype=None
+         )
+
+         self.optimizer_cfg = optimizer_cfg
+
+         # 4. scheduling strategy
+         self.scheduler_cfg = scheduler_cfg
+
+         self.noise_scheduler: DDPMScheduler = instantiate_from_config(scheduler_cfg.noise)
+         self.denoise_scheduler: SchedulerType = instantiate_from_config(scheduler_cfg.denoise)
+
+         # 5. loss configuration
+         self.loss_cfg = loss_cfg
+
+         self.scale_by_std = scale_by_std
+         if scale_by_std:
+             self.register_buffer("z_scale_factor", torch.tensor(z_scale_factor))
+         else:
+             self.z_scale_factor = z_scale_factor
+
+         self.ckpt_path = ckpt_path
+         if ckpt_path is not None:
+             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+
+     def instantiate_non_trainable_model(self, config):
+         model = instantiate_from_config(config)
+         model = model.eval()
+         model.train = disabled_train
+         for param in model.parameters():
+             param.requires_grad = False
+
+         return model
+
+     def instantiate_first_stage(self, first_stage_config):
+         self.first_stage_model = self.instantiate_non_trainable_model(first_stage_config)
+         self.first_stage_model.set_shape_model_only()
+
+     def instantiate_cond_stage(self, cond_stage_config):
+         self.cond_stage_model = self.instantiate_non_trainable_model(cond_stage_config)
+
+     def init_from_ckpt(self, path, ignore_keys=()):
+         state_dict = torch.load(path, map_location="cpu")["state_dict"]
+
+         keys = list(state_dict.keys())
+         for k in keys:
+             for ik in ignore_keys:
+                 if k.startswith(ik):
+                     print("Deleting key {} from state_dict.".format(k))
+                     del state_dict[k]
+
+         missing, unexpected = self.load_state_dict(state_dict, strict=False)
+         print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+         if len(missing) > 0:
+             print(f"Missing Keys: {missing}")
+             print(f"Unexpected Keys: {unexpected}")
+
+     @property
+     def zero_rank(self):
+         if self._trainer:
+             zero_rank = self.trainer.local_rank == 0
+         else:
+             zero_rank = True
+
+         return zero_rank
+
+     def configure_optimizers(self) -> Tuple[List, List]:
+
+         lr = self.learning_rate
+
+         trainable_parameters = list(self.model.parameters())
+         if self.optimizer_cfg is None:
+             optimizers = [torch.optim.AdamW(trainable_parameters, lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)]
+             schedulers = []
+         else:
+             optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=trainable_parameters)
+             scheduler_func = instantiate_from_config(
+                 self.optimizer_cfg.scheduler,
+                 max_decay_steps=self.trainer.max_steps,
+                 lr_max=lr
+             )
+             scheduler = {
+                 "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
+                 "interval": "step",
+                 "frequency": 1
+             }
+             optimizers = [optimizer]
+             schedulers = [scheduler]
+
+         return optimizers, schedulers
+
+     @torch.no_grad()
+     def encode_first_stage(self, surface: torch.FloatTensor, sample_posterior=True):
+
+         z_q = self.first_stage_model.encode(surface, sample_posterior)
+         z_q = self.z_scale_factor * z_q
+
+         return z_q
+
+     @torch.no_grad()
+     def decode_first_stage(self, z_q: torch.FloatTensor, **kwargs):
+
+         z_q = 1. / self.z_scale_factor * z_q
+         latents = self.first_stage_model.decode(z_q, **kwargs)
+         return latents
+
+     @rank_zero_only
+     @torch.no_grad()
+     def on_train_batch_start(self, batch, batch_idx):
+         # only for the very first batch
+         if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 \
+                 and batch_idx == 0 and self.ckpt_path is None:
+             # set rescale weight to 1. / std of encodings
+             print("### USING STD-RESCALING ###")
+
+             z_q = self.encode_first_stage(batch[self.first_stage_key])
+             z = z_q.detach()
+
+             del self.z_scale_factor
+             self.register_buffer("z_scale_factor", 1. / z.flatten().std())
+             print(f"setting self.z_scale_factor to {self.z_scale_factor}")
+
+             print("### USING STD-RESCALING ###")
+
+     def compute_loss(self, model_outputs, split):
+         """
+         Args:
+             model_outputs (dict):
+                 - x_0: the clean latents
+                 - noise: the noise added to the latents
+                 - pred: the denoiser output
+             split (str): "train" or "val"
+
+         Returns:
+             total_loss, loss_dict
+         """
+
+         pred = model_outputs["pred"]
+
+         if self.noise_scheduler.prediction_type == "epsilon":
+             target = model_outputs["noise"]
+         elif self.noise_scheduler.prediction_type == "sample":
+             target = model_outputs["x_0"]
+         else:
+             raise NotImplementedError(f"Prediction Type: {self.noise_scheduler.prediction_type} not yet supported.")
+
+         if self.loss_cfg.loss_type == "l1":
+             simple = F.l1_loss(pred, target, reduction="mean")
+         elif self.loss_cfg.loss_type in ["mse", "l2"]:
+             simple = F.mse_loss(pred, target, reduction="mean")
+         else:
+             raise NotImplementedError(f"Loss Type: {self.loss_cfg.loss_type} not yet supported.")
+
+         total_loss = simple
+
+         loss_dict = {
+             f"{split}/total_loss": total_loss.clone().detach(),
+             f"{split}/simple": simple.detach(),
+         }
+
+         return total_loss, loss_dict
+
+     def forward(self, batch):
+
+         latents = self.encode_first_stage(batch[self.first_stage_key])
+         conditions = self.cond_stage_model.encode(batch[self.cond_stage_key])
+
+         # Sample noise that we'll add to the latents
+         # [batch_size, n_token, latent_dim]
+         noise = torch.randn_like(latents)
+         bs = latents.shape[0]
+         # Sample a random timestep for each sample in the batch
+         timesteps = torch.randint(
+             0,
+             self.noise_scheduler.config.num_train_timesteps,
+             (bs,),
+             device=latents.device,
+         )
+         timesteps = timesteps.long()
+         # Add noise to the latents according to the noise magnitude at each timestep
+         noisy_z = self.noise_scheduler.add_noise(latents, noise, timesteps)
+
+         # diffusion model forward
+         noise_pred = self.model(noisy_z, timesteps, conditions)
+
+ diffusion_outputs = {
262
+ "x_0": noisy_z,
263
+ "noise": noise,
264
+ "pred": noise_pred
265
+ }
266
+
267
+ return diffusion_outputs
268
+
269
+ def training_step(self, batch: Dict[str, Union[torch.FloatTensor, List[str]]],
270
+ batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
271
+ """
272
+
273
+ Args:
274
+ batch (dict): the batch sample, and it contains:
275
+ - surface (torch.FloatTensor):
276
+ - image (torch.FloatTensor): if provide, [bs, 3, h, w], item range [0, 1]
277
+ - depth (torch.FloatTensor): if provide, [bs, 1, h, w], item range [-1, 1]
278
+ - normal (torch.FloatTensor): if provide, [bs, 3, h, w], item range [-1, 1]
279
+ - text (list of str):
280
+
281
+ batch_idx (int):
282
+
283
+ optimizer_idx (int):
284
+
285
+ Returns:
286
+ loss (torch.FloatTensor):
287
+
288
+ """
289
+
290
+ diffusion_outputs = self(batch)
291
+
292
+ loss, loss_dict = self.compute_loss(diffusion_outputs, "train")
293
+ self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
294
+
295
+ return loss
296
+
297
+ def validation_step(self, batch: Dict[str, torch.FloatTensor],
298
+ batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
299
+ """
300
+
301
+ Args:
302
+ batch (dict): the batch sample, and it contains:
303
+ - surface_pc (torch.FloatTensor): [n_pts, 4]
304
+ - surface_feats (torch.FloatTensor): [n_pts, c]
305
+ - text (list of str):
306
+
307
+ batch_idx (int):
308
+
309
+ optimizer_idx (int):
310
+
311
+ Returns:
312
+ loss (torch.FloatTensor):
313
+
314
+ """
315
+
316
+ diffusion_outputs = self(batch)
317
+
318
+ loss, loss_dict = self.compute_loss(diffusion_outputs, "val")
319
+ self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
320
+
321
+ return loss
322
+
323
+ @torch.no_grad()
324
+ def sample(self,
325
+ batch: Dict[str, Union[torch.FloatTensor, List[str]]],
326
+ sample_times: int = 1,
327
+ steps: Optional[int] = None,
328
+ guidance_scale: Optional[float] = None,
329
+ eta: float = 0.0,
330
+ return_intermediates: bool = False, **kwargs):
331
+
332
+ if steps is None:
333
+ steps = self.scheduler_cfg.num_inference_steps
334
+
335
+ if guidance_scale is None:
336
+ guidance_scale = self.scheduler_cfg.guidance_scale
337
+ do_classifier_free_guidance = guidance_scale > 0
338
+
339
+ # conditional encode
340
+ xc = batch[self.cond_stage_key]
341
+
342
+ # print(self.first_stage_model.device, self.cond_stage_model.device, self.device)
343
+
344
+ cond = self.cond_stage_model(xc)
345
+
346
+ if do_classifier_free_guidance:
347
+ un_cond = self.cond_stage_model.unconditional_embedding(batch_size=len(xc))
348
+ cond = torch.cat([un_cond, cond], dim=0)
349
+
350
+ outputs = []
351
+ latents = None
352
+
353
+ if not return_intermediates:
354
+ for _ in range(sample_times):
355
+ sample_loop = ddim_sample(
356
+ self.denoise_scheduler,
357
+ self.model,
358
+ shape=self.first_stage_model.latent_shape,
359
+ cond=cond,
360
+ steps=steps,
361
+ guidance_scale=guidance_scale,
362
+ do_classifier_free_guidance=do_classifier_free_guidance,
363
+ device=self.device,
364
+ eta=eta,
365
+ disable_prog=not self.zero_rank
366
+ )
367
+ for sample, t in sample_loop:
368
+ latents = sample
369
+ outputs.append(self.decode_first_stage(latents, **kwargs))
370
+ else:
371
+
372
+ sample_loop = ddim_sample(
373
+ self.denoise_scheduler,
374
+ self.model,
375
+ shape=self.first_stage_model.latent_shape,
376
+ cond=cond,
377
+ steps=steps,
378
+ guidance_scale=guidance_scale,
379
+ do_classifier_free_guidance=do_classifier_free_guidance,
380
+ device=self.device,
381
+ eta=eta,
382
+ disable_prog=not self.zero_rank
383
+ )
384
+
385
+ iter_size = steps // sample_times
386
+ i = 0
387
+ for sample, t in sample_loop:
388
+ latents = sample
389
+ if i % iter_size == 0 or i == steps - 1:
390
+ outputs.append(self.decode_first_stage(latents, **kwargs))
391
+ i += 1
392
+
393
+ return outputs
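
One behavior shared by both diffuser modules deserves a note: with scale_by_std enabled and no checkpoint given, on_train_batch_start replaces z_scale_factor with the reciprocal standard deviation of the first batch's first-stage encodings, so diffusion operates on roughly unit-variance latents. A self-contained sketch of that rescaling rule with stand-in data:

# Sketch of the std-rescaling rule from on_train_batch_start (stand-in encodings).
import torch

z_q = torch.randn(8, 256, 64) * 3.7          # pretend first-stage encodings, std ~3.7
z_scale_factor = 1.0 / z_q.flatten().std()   # the value the module registers as a buffer
scaled = z_scale_factor * z_q                # what encode_first_stage then returns
assert torch.isclose(scaled.std(), torch.tensor(1.0), atol=1e-3)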
michelangelo/models/asl_diffusion/inference_utils.py ADDED
@@ -0,0 +1,80 @@
+ # -*- coding: utf-8 -*-
+
+ import torch
+ from tqdm import tqdm
+ from typing import Tuple, List, Union, Optional
+ from diffusers.schedulers import DDIMScheduler
+
+
+ __all__ = ["ddim_sample"]
+
+
+ def ddim_sample(ddim_scheduler: DDIMScheduler,
+                 diffusion_model: torch.nn.Module,
+                 shape: Union[List[int], Tuple[int]],
+                 cond: torch.FloatTensor,
+                 steps: int,
+                 eta: float = 0.0,
+                 guidance_scale: float = 3.0,
+                 do_classifier_free_guidance: bool = True,
+                 generator: Optional[torch.Generator] = None,
+                 device: Union[torch.device, str] = "cuda:0",
+                 disable_prog: bool = True):
+
+     assert steps > 0, f"steps must be > 0, got {steps}."
+
+     # init latents; under classifier-free guidance, cond is a doubled batch [uncond, cond]
+     bsz = cond.shape[0]
+     if do_classifier_free_guidance:
+         bsz = bsz // 2
+
+     latents = torch.randn(
+         (bsz, *shape),
+         generator=generator,
+         device=cond.device,
+         dtype=cond.dtype,
+     )
+     # scale the initial noise by the standard deviation required by the scheduler
+     latents = latents * ddim_scheduler.init_noise_sigma
+     # set timesteps
+     ddim_scheduler.set_timesteps(steps)
+     timesteps = ddim_scheduler.timesteps.to(device)
+     # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+     # eta (η) is only used with the DDIMScheduler and should be in [0, 1]
+     extra_step_kwargs = {
+         "eta": eta,
+         "generator": generator
+     }
+
+     # reverse diffusion
+     for i, t in enumerate(tqdm(timesteps, disable=disable_prog, desc="DDIM Sampling:", leave=False)):
+         # expand the latents if we are doing classifier-free guidance
+         latent_model_input = (
+             torch.cat([latents] * 2)
+             if do_classifier_free_guidance
+             else latents
+         )
+         # latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+         # predict the noise residual
+         timestep_tensor = torch.tensor([t], dtype=torch.long, device=device)
+         timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
+         noise_pred = diffusion_model.forward(latent_model_input, timestep_tensor, cond)
+
+         # perform guidance
+         if do_classifier_free_guidance:
+             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + guidance_scale * (
+                 noise_pred_text - noise_pred_uncond
+             )
+         # compute the previous noisy sample x_t -> x_t-1
+         latents = ddim_scheduler.step(
+             noise_pred, t, latents, **extra_step_kwargs
+         ).prev_sample
+
+         yield latents, t
+
+
+ def karra_sample():
+     # placeholder for a Karras-style sampling loop (e.g. with KarrasVeScheduler)
+     pass
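
Because ddim_sample is a generator that yields (latents, t) after every denoising step, callers either drain it for the final latents or tap intermediates along the way, exactly as the Lightning modules above do. A minimal consumption sketch with a dummy denoiser; the module, shapes, and step count are illustrative:

# Minimal consumption sketch for ddim_sample (dummy denoiser, illustrative shapes).
import torch
from diffusers.schedulers import DDIMScheduler
from michelangelo.models.asl_diffusion.inference_utils import ddim_sample

class DummyDenoiser(torch.nn.Module):
    def forward(self, x, t, cond):
        return torch.zeros_like(x)  # stand-in noise prediction

scheduler = DDIMScheduler(num_train_timesteps=1000)
cond = torch.randn(2, 1, 768)  # doubled batch ordered [uncond, cond] for CFG

final = None
for latents, t in ddim_sample(scheduler, DummyDenoiser(), shape=(256, 64),
                              cond=cond, steps=10, guidance_scale=3.0,
                              do_classifier_free_guidance=True, device="cpu"):
    final = latents  # keep only the last step's latents

print(final.shape)  # torch.Size([1, 256, 64])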
michelangelo/models/conditional_encoders/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # -*- coding: utf-8 -*-
+
+ from .clip import CLIPEncoder