Spaces (Paused)

Weiyu Liu committed 8c02843 (parent: a77a4ae): add demo

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- app.py +131 -0
- configs/base.yaml +3 -0
- configs/conditional_pose_diffusion.yaml +81 -0
- configs/pairwise_collision.yaml +42 -0
- data/data00000000.h5 +3 -0
- data/data00000002.h5 +3 -0
- data/data00000003.h5 +3 -0
- data/data00000004.h5 +3 -0
- data/data00000006.h5 +3 -0
- data/data00000008.h5 +3 -0
- data/data00000009.h5 +3 -0
- data/data00000012.h5 +3 -0
- data/data00000013.h5 +3 -0
- data/data00000015.h5 +3 -0
- data/type_vocabs_coarse.json +1 -0
- packages.txt +1 -0
- requirements.txt +13 -0
- scripts/infer.py +78 -0
- scripts/infer_with_discriminator.py +81 -0
- scripts/train_discriminator.py +46 -0
- scripts/train_generator.py +49 -0
- src/StructDiffusion/__init__.py +0 -0
- src/StructDiffusion/__pycache__/__init__.cpython-37.pyc +0 -0
- src/StructDiffusion/__pycache__/__init__.cpython-38.pyc +0 -0
- src/StructDiffusion/data/__init__.py +0 -0
- src/StructDiffusion/data/__pycache__/__init__.cpython-37.pyc +0 -0
- src/StructDiffusion/data/__pycache__/__init__.cpython-38.pyc +0 -0
- src/StructDiffusion/data/__pycache__/pairwise_collision.cpython-37.pyc +0 -0
- src/StructDiffusion/data/__pycache__/semantic_arrangement.cpython-37.pyc +0 -0
- src/StructDiffusion/data/__pycache__/semantic_arrangement.cpython-38.pyc +0 -0
- src/StructDiffusion/data/__pycache__/semantic_arrangement_demo.cpython-38.pyc +0 -0
- src/StructDiffusion/data/pairwise_collision.py +361 -0
- src/StructDiffusion/data/semantic_arrangement.py +579 -0
- src/StructDiffusion/data/semantic_arrangement_demo.py +563 -0
- src/StructDiffusion/diffusion/__init__.py +0 -0
- src/StructDiffusion/diffusion/__pycache__/__init__.cpython-37.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/__init__.cpython-38.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/noise_schedule.cpython-37.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/noise_schedule.cpython-38.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/pose_conversion.cpython-37.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/pose_conversion.cpython-38.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/sampler.cpython-37.pyc +0 -0
- src/StructDiffusion/diffusion/__pycache__/sampler.cpython-38.pyc +0 -0
- src/StructDiffusion/diffusion/noise_schedule.py +81 -0
- src/StructDiffusion/diffusion/pose_conversion.py +103 -0
- src/StructDiffusion/diffusion/sampler.py +296 -0
- src/StructDiffusion/language/__init__.py +0 -0
- src/StructDiffusion/language/__pycache__/__init__.cpython-37.pyc +0 -0
- src/StructDiffusion/language/__pycache__/__init__.cpython-38.pyc +0 -0
- src/StructDiffusion/language/__pycache__/tokenizer.cpython-37.pyc +0 -0
app.py
ADDED
@@ -0,0 +1,131 @@
import os
import argparse
import torch
import trimesh
import numpy as np
import pytorch_lightning as pl
import gradio as gr
from omegaconf import OmegaConf

import sys
sys.path.append('./src')

from StructDiffusion.data.semantic_arrangement_demo import SemanticArrangementDataset
from StructDiffusion.language.tokenizer import Tokenizer
from StructDiffusion.models.pl_models import ConditionalPoseDiffusionModel
from StructDiffusion.diffusion.sampler import Sampler
from StructDiffusion.diffusion.pose_conversion import get_struct_objs_poses
from StructDiffusion.utils.files import get_checkpoint_path_from_dir
from StructDiffusion.utils.batch_inference import move_pc_and_create_scene_simple, visualize_batch_pcs
from StructDiffusion.utils.rearrangement import show_pcs_with_trimesh


class Infer_Wrapper:

    def __init__(self, args, cfg):

        # load
        pl.seed_everything(args.eval_random_seed)
        self.device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))

        checkpoint_dir = os.path.join(cfg.WANDB.save_dir, cfg.WANDB.project, args.checkpoint_id, "checkpoints")
        checkpoint_path = get_checkpoint_path_from_dir(checkpoint_dir)

        self.tokenizer = Tokenizer(cfg.DATASET.vocab_dir)
        # override ignore_rgb for visualization
        cfg.DATASET.ignore_rgb = False
        self.dataset = SemanticArrangementDataset(tokenizer=self.tokenizer, **cfg.DATASET)

        self.sampler = Sampler(ConditionalPoseDiffusionModel, checkpoint_path, self.device)

    def run(self, di):

        # di = np.random.choice(len(self.dataset))

        raw_datum = self.dataset.get_raw_data(di)
        print(self.tokenizer.convert_structure_params_to_natural_language(raw_datum["sentence"]))
        datum = self.dataset.convert_to_tensors(raw_datum, self.tokenizer)
        batch = self.dataset.single_datum_to_batch(datum, args.num_samples, self.device, inference_mode=True)

        num_poses = datum["goal_poses"].shape[0]
        xs = self.sampler.sample(batch, num_poses)

        struct_pose, pc_poses_in_struct = get_struct_objs_poses(xs[0])
        new_obj_xyzs = move_pc_and_create_scene_simple(batch["pcs"], struct_pose, pc_poses_in_struct)

        # vis
        vis_obj_xyzs = new_obj_xyzs[:3]
        if torch.is_tensor(vis_obj_xyzs):
            if vis_obj_xyzs.is_cuda:
                vis_obj_xyzs = vis_obj_xyzs.detach().cpu()
            vis_obj_xyzs = vis_obj_xyzs.numpy()

        # for bi, vis_obj_xyz in enumerate(vis_obj_xyzs):
        #     if verbose:
        #         print("example {}".format(bi))
        #         print(vis_obj_xyz.shape)
        #
        #     if trimesh:
        #         show_pcs_with_trimesh([xyz[:, :3] for xyz in vis_obj_xyz], [xyz[:, 3:] for xyz in vis_obj_xyz])
        vis_obj_xyz = vis_obj_xyzs[0]
        scene = show_pcs_with_trimesh([xyz[:, :3] for xyz in vis_obj_xyz], [xyz[:, 3:] for xyz in vis_obj_xyz], return_scene=True)

        scene_filename = "./tmp_data/scene.glb"
        scene.export(scene_filename)

        # pc_filename = "/home/weiyu/Research/StructDiffusion/StructDiffusion/interactive_demo/tmp_data/pc.glb"
        # scene_filename = "/home/weiyu/Research/StructDiffusion/StructDiffusion/interactive_demo/tmp_data/scene.glb"
        #
        # vis_obj_xyz = vis_obj_xyz.reshape(-1, 6)
        # vis_pc = trimesh.PointCloud(vis_obj_xyz[:, :3], colors=np.concatenate([vis_obj_xyz[:, 3:] * 255, np.ones([vis_obj_xyz.shape[0], 1]) * 255], axis=-1))
        # vis_pc.export(pc_filename)
        #
        # scene = trimesh.Scene()
        # # add the coordinate frame first
        # # geom = trimesh.creation.axis(0.01)
        # # scene.add_geometry(geom)
        # table = trimesh.creation.box(extents=[1.0, 1.0, 0.02])
        # table.apply_translation([0.5, 0, -0.01])
        # table.visual.vertex_colors = [150, 111, 87, 125]
        # scene.add_geometry(table)
        # # bounds = trimesh.creation.box(extents=[4.0, 4.0, 4.0])
        # # bounds = trimesh.creation.icosphere(subdivisions=3, radius=3.1)
        # # bounds.apply_translation([0, 0, 0])
        # # bounds.visual.vertex_colors = [30, 30, 30, 30]
        # # scene.add_geometry(bounds)
        # # RT_4x4 = np.array([[-0.39560353822208355, -0.9183993826406329, 0.006357240869497738, 0.2651463080169481],
        # #                    [-0.797630370081598, 0.3401340617616391, -0.4980909683511864, 0.2225696480721997],
        # #                    [0.45528412367406523, -0.2021172778236285, -0.8671014777611122, 0.9449050652025951],
        # #                    [0.0, 0.0, 0.0, 1.0]])
        # # RT_4x4 = np.linalg.inv(RT_4x4)
        # # RT_4x4 = RT_4x4 @ np.diag([1, -1, -1, 1])
        # # scene.camera_transform = RT_4x4
        #
        # mesh_list = trimesh.util.concatenate(scene.dump())
        # print(mesh_list)
        # trimesh.io.export.export_mesh(mesh_list, scene_filename, file_type='obj')

        return scene_filename


args = OmegaConf.create()
args.base_config_file = "./configs/base.yaml"
args.config_file = "./configs/conditional_pose_diffusion.yaml"
args.checkpoint_id = "ConditionalPoseDiffusion"
args.eval_random_seed = 42
args.num_samples = 1

base_cfg = OmegaConf.load(args.base_config_file)
cfg = OmegaConf.load(args.config_file)
cfg = OmegaConf.merge(base_cfg, cfg)

infer_wrapper = Infer_Wrapper(args, cfg)

demo = gr.Interface(
    fn=infer_wrapper.run,
    inputs=gr.Slider(0, len(infer_wrapper.dataset)),
    # clear color range [0-1.0]
    outputs=gr.Model3D(clear_color=[0, 0, 0, 0], label="3D Model")
)

demo.launch()
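One practical note on the export step above: scene.export("./tmp_data/scene.glb") assumes the tmp_data directory already exists, and trimesh will not create parent directories for you. A minimal sketch (not part of the commit) that guards against a missing directory on a fresh clone:

import os

os.makedirs("./tmp_data", exist_ok=True)  # scene.export() opens the file directly and fails if the folder is absent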
configs/base.yaml
ADDED
@@ -0,0 +1,3 @@
base_dirs:
  data: data
  wandb_dir: wandb_logs
configs/conditional_pose_diffusion.yaml
ADDED
@@ -0,0 +1,81 @@
random_seed: 1

WANDB:
  project: StructDiffusion
  save_dir: ${base_dirs.wandb_dir}
  name: conditional_pose_diffusion

DATASET:
  data_root: ${base_dirs.data}
  vocab_dir: ${base_dirs.data}/type_vocabs_coarse.json

  # important
  use_virtual_structure_frame: True
  ignore_distractor_objects: True
  ignore_rgb: True

  # the following are determined by the dataset
  max_num_target_objects: 7
  max_num_distractor_objects: 5
  max_num_shape_parameters: 5
  # set to zeros because they are not used for now
  max_num_rearrange_features: 0
  max_num_anchor_features: 0

  num_pts: 1024
  filter_num_moved_objects_range:
  data_augmentation: False

DATALOADER:
  batch_size: 64
  num_workers: 8
  pin_memory: True

MODEL:
  # transformer encoder
  encoder_input_dim: 256
  num_attention_heads: 8
  encoder_hidden_dim: 512
  encoder_dropout: 0.0
  encoder_activation: relu
  encoder_num_layers: 8
  # output head
  structure_dropout: 0
  object_dropout: 0
  # pc encoder
  ignore_rgb: ${DATASET.ignore_rgb}
  pc_emb_dim: 256
  posed_pc_emb_dim: 80
  # pose encoder
  pose_emb_dim: 80
  # language
  word_emb_dim: 160
  # diffusion step
  time_emb_dim: 80
  # sequence embeddings
  # max_num_target_objects (+ max_num_distractor_objects if not ignore_distractor_objects)
  max_seq_size: 7
  max_token_type_size: 4
  seq_pos_emb_dim: 8
  seq_type_emb_dim: 8
  # virtual frame
  use_virtual_structure_frame: ${DATASET.use_virtual_structure_frame}

NOISE_SCHEDULE:
  timesteps: 200

LOSS:
  type: huber

OPTIMIZER:
  lr: 0.0001
  weight_decay: 0 #0.0001
  # lr_restart: 3000
  # warmup: 10

TRAINER:
  max_epochs: 200
  gradient_clip_val: 1.0
  gpus: 1
  deterministic: False
  # enable_progress_bar: False
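The ${base_dirs.*} and ${DATASET.*} references above are standard OmegaConf interpolations; they only resolve once this file is merged on top of configs/base.yaml, which is exactly what app.py and the scripts do. A small sketch of that resolution (paths assume the repo root as working directory):

from omegaconf import OmegaConf

base_cfg = OmegaConf.load("configs/base.yaml")
cfg = OmegaConf.merge(base_cfg, OmegaConf.load("configs/conditional_pose_diffusion.yaml"))
print(cfg.DATASET.vocab_dir)   # data/type_vocabs_coarse.json, via ${base_dirs.data}
print(cfg.MODEL.ignore_rgb)    # True, mirrored from ${DATASET.ignore_rgb}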
configs/pairwise_collision.yaml
ADDED
@@ -0,0 +1,42 @@
random_seed: 1

WANDB:
  project: StructDiffusion
  save_dir: ${base_dirs.wandb_dir}
  name: pairwise_collision

DATASET:
  urdf_pc_idx_file: ${base_dirs.pairwise_collision_data}/urdf_pc_idx.pkl
  collision_data_dir: ${base_dirs.pairwise_collision_data}

  # important
  num_pts: 1024
  num_scene_pts: 2048
  normalize_pc: True
  random_rotation: True
  data_augmentation: False

DATALOADER:
  batch_size: 32
  num_workers: 8
  pin_memory: True

MODEL:
  max_num_objects: 2
  include_env_pc: False
  pct_random_sampling: True

LOSS:
  type: Focal
  focal_gamma: 2

OPTIMIZER:
  lr: 0.0001
  weight_decay: 0

TRAINER:
  max_epochs: 200
  gradient_clip_val: 1.0
  gpus: 1
  deterministic: False
  # enable_progress_bar: False
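Note that ${base_dirs.pairwise_collision_data} is not defined in the bundled configs/base.yaml (which only sets data and wandb_dir), so resolving these DATASET paths as-is would fail with an OmegaConf interpolation error. A hedged sketch of one way to supply it before training the discriminator; the path shown is hypothetical:

from omegaconf import OmegaConf

base_cfg = OmegaConf.load("configs/base.yaml")
base_cfg.base_dirs.pairwise_collision_data = "/path/to/pairwise_collision_data"  # hypothetical location
cfg = OmegaConf.merge(base_cfg, OmegaConf.load("configs/pairwise_collision.yaml"))
print(OmegaConf.to_container(cfg.DATASET, resolve=True))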
data/data00000000.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:947574252625d338b9f37217eacf61f520136e27b458b6d3e65330339e8b299c
size 1271489
data/data00000002.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3302432de555fed767c5b0d99c35ca01d5e4ac38cf4a0760b8ccb456b432e0e0
size 3235242
data/data00000003.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b907ba7c3a17f98a438617b462b2a4d3d3f8593c2dc47feb5a6cc3da8c034fc
size 2059708
data/data00000004.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8ec0136dd4055d304e9b7f5697b79613099b8f8f1e5eec94281f22d8d47cca1
size 2591656
data/data00000006.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e74ebf185b0af58df0fa2483d5fd58a12b3b62ccac27ff665f35c5c7a13b8d8
size 1572332
data/data00000008.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:db015354a9d53e6fbaf0b040ce226484150b0af226a5c13a0b9f5cb9961db73c
size 2167265
data/data00000009.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:990ad13f423d9089b30de81d002d23d9d00cf3e007fd7073793cbec03c456ebb
size 3607752
data/data00000012.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93161f5666c54dbc259c9efa516b67340613b592a1ed42e6c63d4cc8a495002a
size 2525622
data/data00000013.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94c9cfe6d9f0df176eb0a3baccdf53c7e6e5fc807e5e7ea9e138ad7159f500d9
size 1715352
data/data00000015.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9aab522f9ace1a03b1705fe3bd693b589971d133d45c232ef9ec53842a540bfa
size 2647026
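The ten data/*.h5 entries above are Git LFS pointers, not the HDF5 payloads themselves; the actual files have to be pulled (e.g. with git lfs) before the demo dataset can read them. A short sketch for inspecting one of them once it is materialized; the key names are whatever the dataset code expects (see semantic_arrangement_demo.py), not something this snippet assumes:

import h5py

with h5py.File("data/data00000000.h5", "r") as fh:
    print(sorted(fh.keys()))  # e.g. camera views, id_* object ids, goal_specification, ...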
data/type_vocabs_coarse.json
ADDED
@@ -0,0 +1 @@
{"class": {"Basket": 0, "BeerBottle": 1, "Book": 2, "Bottle": 3, "Bowl": 4, "Calculator": 5, "Candle": 6, "CellPhone": 7, "ComputerMouse": 8, "Controller": 9, "Cup": 10, "Donut": 11, "Fork": 12, "Hammer": 13, "Knife": 14, "Marker": 15, "MilkCarton": 16, "Mug": 17, "Pan": 18, "Pen": 19, "PillBottle": 20, "Plate": 21, "PowerStrip": 22, "Scissors": 23, "SoapBottle": 24, "SodaCan": 25, "Spoon": 26, "Stapler": 27, "Teapot": 28, "VideoGameController": 29, "WineBottle": 30, "CanOpener":31, "Fruit": 32}, "scene": {"dinner": 0}, "size": {"L": 0, "M": 1, "S": 2}, "color": {"blue": 0, "cyan": 1, "green": 2, "magenta": 3, "red": 4, "yellow": 5}, "material": {"glass": 0, "metal": 1, "plastic": 2}, "comparator": {"less": 1, "greater": 2, "equal": 3}, "radius": [0.0, 0.5, 3], "position_x": [-0.1, 1.0, 3], "position_y": [-0.5, 0.5, 3], "rotation": [-3.15, 3.15, 4], "height": [0.0, 0.5, 10], "volumn": [0.0, 0.015, 10], "uniform_angle": {"False": 0, "True": 1}, "face_center": {"False": 0, "True": 1}, "angle_ratio": {"0.5": 0, "1.0": 1}, "shape": {"circle": 0, "line": 1, "tower": 2, "dinner": 3}, "obj_x": [-1.0, 1.0, 200], "obj_y": [-1.0, 1.0, 200], "obj_z": [-1.0, 1.0, 200], "obj_rr": [-3.15, 3.15, 360], "obj_rp": [-3.15, 3.15, 360], "obj_ry": [-3.15, 3.15, 360],"struct_x": [-1.0, 1.0, 200], "struct_y": [-1.0, 1.0, 200], "struct_z": [-1.0, 1.0, 200], "struct_rr": [-3.15, 3.15, 360], "struct_rp": [-3.15, 3.15, 360], "struct_ry": [-3.15, 3.15, 360]}
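A hedged reading of this vocabulary (inferred from how the Tokenizer is used elsewhere in the diff; the tokenizer source itself is not shown): the string-valued maps are discrete token classes, and each numeric triple appears to be [min, max, number of bins] for discretizing a continuous quantity such as an object or structure pose component.

import json

with open("data/type_vocabs_coarse.json") as fh:
    vocab = json.load(fh)

lo, hi, bins = vocab["obj_x"]             # assumed [min, max, num_bins] layout
print(len(vocab["class"]), lo, hi, bins)  # 33 object classes; obj_x spans [-1, 1] over 200 bins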
packages.txt
ADDED
@@ -0,0 +1 @@
python3-opencv
requirements.txt
ADDED
@@ -0,0 +1,13 @@
numpy==1.21
h5py==2.10.0
opencv-python
open3d
trimesh==3.10.2
pyglet==1.5.0
pybullet==3.1.7
nvisii==1.1.70
openpyxl
pytorch_lightning==1.6.1
wandb==0.13.10
pytorch3d==0.3.0
omegaconf==2.2.2
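Note that gradio and torch are imported by the code but not pinned here; on Spaces, gradio comes from the runtime and torch is typically pulled in transitively. A quick, hedged sanity check that the resulting environment exposes what the demo imports:

from importlib.metadata import version

for pkg in ["torch", "gradio", "omegaconf", "trimesh", "h5py"]:
    print(pkg, version(pkg))  # raises PackageNotFoundError if a package is missing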
scripts/infer.py
ADDED
@@ -0,0 +1,78 @@
import os
import argparse
import torch
import numpy as np
import pytorch_lightning as pl
from omegaconf import OmegaConf

from StructDiffusion.data.semantic_arrangement import SemanticArrangementDataset
from StructDiffusion.language.tokenizer import Tokenizer
from StructDiffusion.models.pl_models import ConditionalPoseDiffusionModel
from StructDiffusion.diffusion.sampler import Sampler
from StructDiffusion.diffusion.pose_conversion import get_struct_objs_poses
from StructDiffusion.utils.files import get_checkpoint_path_from_dir
from StructDiffusion.utils.batch_inference import move_pc_and_create_scene_simple, visualize_batch_pcs


def main(args, cfg):

    pl.seed_everything(args.eval_random_seed)

    device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))

    checkpoint_dir = os.path.join(cfg.WANDB.save_dir, cfg.WANDB.project, args.checkpoint_id, "checkpoints")
    checkpoint_path = get_checkpoint_path_from_dir(checkpoint_dir)

    if args.eval_mode == "infer":

        tokenizer = Tokenizer(cfg.DATASET.vocab_dir)
        # override ignore_rgb for visualization
        cfg.DATASET.ignore_rgb = False
        dataset = SemanticArrangementDataset(split="test", tokenizer=tokenizer, **cfg.DATASET)

        sampler = Sampler(ConditionalPoseDiffusionModel, checkpoint_path, device)

        data_idxs = np.random.permutation(len(dataset))
        for di in data_idxs:
            raw_datum = dataset.get_raw_data(di)
            print(tokenizer.convert_structure_params_to_natural_language(raw_datum["sentence"]))
            datum = dataset.convert_to_tensors(raw_datum, tokenizer)
            batch = dataset.single_datum_to_batch(datum, args.num_samples, device, inference_mode=True)

            num_poses = datum["goal_poses"].shape[0]
            xs = sampler.sample(batch, num_poses)

            struct_pose, pc_poses_in_struct = get_struct_objs_poses(xs[0])
            new_obj_xyzs = move_pc_and_create_scene_simple(batch["pcs"], struct_pose, pc_poses_in_struct)
            visualize_batch_pcs(new_obj_xyzs, args.num_samples, limit_B=10, trimesh=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="infer")
    parser.add_argument("--base_config_file", help='base config yaml file',
                        default='../configs/base.yaml',
                        type=str)
    parser.add_argument("--config_file", help='config yaml file',
                        default='../configs/conditional_pose_diffusion.yaml',
                        type=str)
    parser.add_argument("--checkpoint_id",
                        default="ConditionalPoseDiffusion",
                        type=str)
    parser.add_argument("--eval_mode",
                        default="infer",
                        type=str)
    parser.add_argument("--eval_random_seed",
                        default=42,
                        type=int)
    parser.add_argument("--num_samples",
                        default=10,
                        type=int)
    args = parser.parse_args()

    base_cfg = OmegaConf.load(args.base_config_file)
    cfg = OmegaConf.load(args.config_file)
    cfg = OmegaConf.merge(base_cfg, cfg)

    main(args, cfg)
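The argparse defaults above use relative paths ('../configs/...'), which assume the script is launched from inside scripts/. A sketch of driving the same entry point programmatically from the repo root, mirroring how app.py builds its arguments (the import line is an assumption and depends on the repo root being on sys.path):

from omegaconf import OmegaConf
# from scripts.infer import main   # assumed import path

args = OmegaConf.create({
    "checkpoint_id": "ConditionalPoseDiffusion",
    "eval_mode": "infer",
    "eval_random_seed": 42,
    "num_samples": 4,
})
cfg = OmegaConf.merge(OmegaConf.load("configs/base.yaml"),
                      OmegaConf.load("configs/conditional_pose_diffusion.yaml"))
# main(args, cfg)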
scripts/infer_with_discriminator.py
ADDED
@@ -0,0 +1,81 @@
import os
import argparse
import torch
import numpy as np
import pytorch_lightning as pl
from omegaconf import OmegaConf

from StructDiffusion.data.semantic_arrangement import SemanticArrangementDataset
from StructDiffusion.language.tokenizer import Tokenizer
from StructDiffusion.models.pl_models import ConditionalPoseDiffusionModel, PairwiseCollisionModel
from StructDiffusion.diffusion.sampler import SamplerV2
from StructDiffusion.diffusion.pose_conversion import get_struct_objs_poses
from StructDiffusion.utils.files import get_checkpoint_path_from_dir
from StructDiffusion.utils.batch_inference import move_pc_and_create_scene_simple, visualize_batch_pcs


def main(args, cfg):

    pl.seed_everything(args.eval_random_seed)

    device = (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))

    diffusion_checkpoint_path = get_checkpoint_path_from_dir(os.path.join(cfg.WANDB.save_dir, cfg.WANDB.project, args.diffusion_checkpoint_id, "checkpoints"))
    collision_checkpoint_path = get_checkpoint_path_from_dir(os.path.join(cfg.WANDB.save_dir, cfg.WANDB.project, args.collision_checkpoint_id, "checkpoints"))

    if args.eval_mode == "infer":

        tokenizer = Tokenizer(cfg.DATASET.vocab_dir)
        # override ignore_rgb for visualization
        cfg.DATASET.ignore_rgb = False
        dataset = SemanticArrangementDataset(split="test", tokenizer=tokenizer, **cfg.DATASET)

        sampler = SamplerV2(ConditionalPoseDiffusionModel, diffusion_checkpoint_path,
                            PairwiseCollisionModel, collision_checkpoint_path, device)

        data_idxs = np.random.permutation(len(dataset))
        for di in data_idxs:
            raw_datum = dataset.get_raw_data(di)
            print(tokenizer.convert_structure_params_to_natural_language(raw_datum["sentence"]))
            datum = dataset.convert_to_tensors(raw_datum, tokenizer)
            batch = dataset.single_datum_to_batch(datum, args.num_samples, device, inference_mode=True)

            num_poses = datum["goal_poses"].shape[0]
            struct_pose, pc_poses_in_struct = sampler.sample(batch, num_poses)

            new_obj_xyzs = move_pc_and_create_scene_simple(batch["pcs"], struct_pose, pc_poses_in_struct)
            visualize_batch_pcs(new_obj_xyzs, args.num_samples, limit_B=10, trimesh=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="infer")
    parser.add_argument("--base_config_file", help='base config yaml file',
                        default='../configs/base.yaml',
                        type=str)
    parser.add_argument("--config_file", help='config yaml file',
                        default='../configs/conditional_pose_diffusion.yaml',
                        type=str)
    parser.add_argument("--diffusion_checkpoint_id",
                        default="ConditionalPoseDiffusion",
                        type=str)
    parser.add_argument("--collision_checkpoint_id",
                        default="curhl56k",
                        type=str)
    parser.add_argument("--eval_mode",
                        default="infer",
                        type=str)
    parser.add_argument("--eval_random_seed",
                        default=42,
                        type=int)
    parser.add_argument("--num_samples",
                        default=10,
                        type=int)
    args = parser.parse_args()

    base_cfg = OmegaConf.load(args.base_config_file)
    cfg = OmegaConf.load(args.config_file)
    cfg = OmegaConf.merge(base_cfg, cfg)

    main(args, cfg)
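The default --collision_checkpoint_id of "curhl56k" looks like a Weights & Biases run id rather than a human-readable name; both checkpoints are resolved through the same wandb-style directory layout. A sketch of the paths get_checkpoint_path_from_dir is asked to search, using values from the merged config:

import os

save_dir, project = "wandb_logs", "StructDiffusion"   # from base.yaml and the WANDB section above
for run_id in ["ConditionalPoseDiffusion", "curhl56k"]:
    print(os.path.join(save_dir, project, run_id, "checkpoints"))
    # get_checkpoint_path_from_dir() presumably picks a .ckpt file inside this directory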
scripts/train_discriminator.py
ADDED
@@ -0,0 +1,46 @@
import argparse
import torch
from torch.utils.data import DataLoader
from omegaconf import OmegaConf
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from StructDiffusion.data.pairwise_collision import PairwiseCollisionDataset
from StructDiffusion.models.pl_models import PairwiseCollisionModel


def main(cfg):

    pl.seed_everything(cfg.random_seed)

    wandb_logger = WandbLogger(**cfg.WANDB)
    wandb_logger.experiment.config.update(cfg)
    checkpoint_callback = ModelCheckpoint()

    full_dataset = PairwiseCollisionDataset(**cfg.DATASET)
    train_dataset, valid_dataset = torch.utils.data.random_split(full_dataset, [int(len(full_dataset) * 0.7), len(full_dataset) - int(len(full_dataset) * 0.7)])
    train_dataloader = DataLoader(train_dataset, shuffle=True, **cfg.DATALOADER)
    valid_dataloader = DataLoader(valid_dataset, shuffle=False, **cfg.DATALOADER)

    model = PairwiseCollisionModel(cfg.MODEL, cfg.LOSS, cfg.OPTIMIZER, cfg.DATASET)

    trainer = pl.Trainer(logger=wandb_logger, callbacks=[checkpoint_callback], **cfg.TRAINER)

    trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=valid_dataloader)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train")
    parser.add_argument("--base_config_file", help='base config yaml file',
                        default='../configs/base.yaml',
                        type=str)
    parser.add_argument("--config_file", help='config yaml file',
                        default='../configs/pairwise_collision.yaml',
                        type=str)
    args = parser.parse_args()
    base_cfg = OmegaConf.load(args.base_config_file)
    cfg = OmegaConf.load(args.config_file)
    cfg = OmegaConf.merge(base_cfg, cfg)

    main(cfg)
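A small aside on the 70/30 split above: computing the validation size as the remainder, rather than int(len * 0.3), guarantees the two lengths sum to the dataset size, which random_split requires.

# Worked example of the split arithmetic used above.
n = 10_001                 # hypothetical dataset length
n_train = int(n * 0.7)     # 7000
n_valid = n - n_train      # 3001; int(n * 0.3) would give 3000 and random_split would reject the pair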
scripts/train_generator.py
ADDED
@@ -0,0 +1,49 @@
from torch.utils.data import DataLoader
import argparse
from omegaconf import OmegaConf
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from StructDiffusion.data.semantic_arrangement import SemanticArrangementDataset
from StructDiffusion.language.tokenizer import Tokenizer
from StructDiffusion.models.pl_models import ConditionalPoseDiffusionModel


def main(cfg):

    pl.seed_everything(cfg.random_seed)

    wandb_logger = WandbLogger(**cfg.WANDB)
    wandb_logger.experiment.config.update(cfg)
    checkpoint_callback = ModelCheckpoint()

    tokenizer = Tokenizer(cfg.DATASET.vocab_dir)
    vocab_size = tokenizer.get_vocab_size()

    train_dataset = SemanticArrangementDataset(split="train", tokenizer=tokenizer, **cfg.DATASET)
    valid_dataset = SemanticArrangementDataset(split="valid", tokenizer=tokenizer, **cfg.DATASET)
    train_dataloader = DataLoader(train_dataset, shuffle=True, **cfg.DATALOADER)
    valid_dataloader = DataLoader(valid_dataset, shuffle=False, **cfg.DATALOADER)

    model = ConditionalPoseDiffusionModel(vocab_size, cfg.MODEL, cfg.LOSS, cfg.NOISE_SCHEDULE, cfg.OPTIMIZER)

    trainer = pl.Trainer(logger=wandb_logger, callbacks=[checkpoint_callback], **cfg.TRAINER)

    trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=valid_dataloader)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train")
    parser.add_argument("--base_config_file", help='base config yaml file',
                        default='../configs/base.yaml',
                        type=str)
    parser.add_argument("--config_file", help='config yaml file',
                        default='../configs/conditional_pose_diffusion.yaml',
                        type=str)
    args = parser.parse_args()
    base_cfg = OmegaConf.load(args.base_config_file)
    cfg = OmegaConf.load(args.config_file)
    cfg = OmegaConf.merge(base_cfg, cfg)

    main(cfg)
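The TRAINER block of the config is splatted directly into pl.Trainer, so its keys must be valid Trainer arguments for the pinned pytorch_lightning==1.6.1 (where gpus=1 is still the way to request a single GPU). The equivalent explicit call, for reference:

import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs=200, gradient_clip_val=1.0, gpus=1, deterministic=False)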
src/StructDiffusion/__init__.py
ADDED
File without changes

src/StructDiffusion/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (171 Bytes)

src/StructDiffusion/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (175 Bytes)

src/StructDiffusion/data/__init__.py
ADDED
File without changes

src/StructDiffusion/data/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (176 Bytes)

src/StructDiffusion/data/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (180 Bytes)

src/StructDiffusion/data/__pycache__/pairwise_collision.cpython-37.pyc
ADDED
Binary file (9.72 kB)

src/StructDiffusion/data/__pycache__/semantic_arrangement.cpython-37.pyc
ADDED
Binary file (17.1 kB)

src/StructDiffusion/data/__pycache__/semantic_arrangement.cpython-38.pyc
ADDED
Binary file (17.1 kB)

src/StructDiffusion/data/__pycache__/semantic_arrangement_demo.cpython-38.pyc
ADDED
Binary file (16.4 kB)
src/StructDiffusion/data/pairwise_collision.py
ADDED
@@ -0,0 +1,361 @@
import cv2
import h5py
import numpy as np
import os
import trimesh
import torch
import json
from collections import defaultdict
import tqdm
import pickle
from random import shuffle

# Local imports
from StructDiffusion.utils.rearrangement import show_pcs, get_pts, array_to_tensor
from StructDiffusion.utils.pointnet import pc_normalize

import StructDiffusion.utils.brain2.camera as cam
import StructDiffusion.utils.brain2.image as img
import StructDiffusion.utils.transformations as tra


def load_pairwise_collision_data(h5_filename):

    fh = h5py.File(h5_filename, 'r')
    data_dict = {}
    data_dict["obj1_info"] = eval(fh["obj1_info"][()])
    data_dict["obj2_info"] = eval(fh["obj2_info"][()])
    data_dict["obj1_poses"] = fh["obj1_poses"][:]
    data_dict["obj2_poses"] = fh["obj2_poses"][:]
    data_dict["intersection_labels"] = fh["intersection_labels"][:]

    return data_dict


class PairwiseCollisionDataset(torch.utils.data.Dataset):

    def __init__(self, urdf_pc_idx_file, collision_data_dir, random_rotation=True,
                 num_pts=1024, normalize_pc=True, num_scene_pts=2048, data_augmentation=False,
                 debug=False):

        # load dictionary mapping from urdf to list of pc data, each sample is
        # {"step_t": step_t, "obj": obj, "filename": filename}
        with open(urdf_pc_idx_file, "rb") as fh:
            self.urdf_to_pc_data = pickle.load(fh)
        # filter out broken files
        for urdf in self.urdf_to_pc_data:
            valid_pc_data = []
            for pd in self.urdf_to_pc_data[urdf]:
                filename = pd["filename"]
                if "data00026058" in filename or "data00011415" in filename or "data00026061" in filename or "data00700565" in filename or "data00505290" in filename:
                    continue
                valid_pc_data.append(pd)
            if valid_pc_data:
                self.urdf_to_pc_data[urdf] = valid_pc_data

        # build data index
        # each sample is a tuple of (collision filename, idx for the labels and poses)
        if collision_data_dir is not None:
            self.data_idxs = self.build_data_idxs(collision_data_dir)
        else:
            print("WARNING: collision_data_dir is None")

        self.num_pts = num_pts
        self.debug = debug
        self.normalize_pc = normalize_pc
        self.num_scene_pts = num_scene_pts
        self.random_rotation = random_rotation

        # Noise
        self.data_augmentation = data_augmentation
        # additive noise
        self.gp_rescale_factor_range = [12, 20]
        self.gaussian_scale_range = [0., 0.003]
        # multiplicative noise
        self.gamma_shape = 1000.
        self.gamma_scale = 0.001

    def build_data_idxs(self, collision_data_dir):
        print("Load collision data...")
        positive_data = []
        negative_data = []
        for filename in tqdm.tqdm(os.listdir(collision_data_dir)):
            if "h5" not in filename:
                continue
            h5_filename = os.path.join(collision_data_dir, filename)
            data_dict = load_pairwise_collision_data(h5_filename)
            obj1_urdf = data_dict["obj1_info"]["urdf"]
            obj2_urdf = data_dict["obj2_info"]["urdf"]
            if obj1_urdf not in self.urdf_to_pc_data:
                print("no pc data for urdf:", obj1_urdf)
                continue
            if obj2_urdf not in self.urdf_to_pc_data:
                print("no pc data for urdf:", obj2_urdf)
                continue
            for idx, l in enumerate(data_dict["intersection_labels"]):
                if l:
                    # intersection
                    positive_data.append((h5_filename, idx))
                else:
                    negative_data.append((h5_filename, idx))
        print("Num pairwise intersections:", len(positive_data))
        print("Num pairwise no intersections:", len(negative_data))

        if len(negative_data) != len(positive_data):
            min_len = min(len(negative_data), len(positive_data))
            positive_data = [positive_data[i] for i in np.random.permutation(len(positive_data))[:min_len]]
            negative_data = [negative_data[i] for i in np.random.permutation(len(negative_data))[:min_len]]
            print("after balancing")
            print("Num pairwise intersections:", len(positive_data))
            print("Num pairwise no intersections:", len(negative_data))

        return positive_data + negative_data

    def create_urdf_pc_idxs(self, urdf_pc_idx_file, data_roots, index_roots):
        print("Load pc data")
        arrangement_steps = []
        for split in ["train"]:
            for data_root, index_root in zip(data_roots, index_roots):
                arrangement_indices_file = os.path.join(data_root, index_root, "{}_arrangement_indices_file_all.txt".format(split))
                if os.path.exists(arrangement_indices_file):
                    with open(arrangement_indices_file, "r") as fh:
                        arrangement_steps.extend([(os.path.join(data_root, f[0]), f[1]) for f in eval(fh.readline().strip())])
                else:
                    print("{} does not exist".format(arrangement_indices_file))

        urdf_to_pc_data = defaultdict(list)
        for filename, step_t in tqdm.tqdm(arrangement_steps):
            h5 = h5py.File(filename, 'r')
            ids = self._get_ids(h5)
            # moved_objs = h5['moved_objs'][()].split(',')
            all_objs = sorted([o for o in ids.keys() if "object_" in o])
            goal_specification = json.loads(str(np.array(h5["goal_specification"])))
            obj_infos = goal_specification["rearrange"]["objects"] + goal_specification["anchor"]["objects"] + goal_specification["distract"]["objects"]
            for obj, obj_info in zip(all_objs, obj_infos):
                urdf_to_pc_data[obj_info["urdf"]].append({"step_t": step_t, "obj": obj, "filename": filename})

        with open(urdf_pc_idx_file, "wb") as fh:
            pickle.dump(urdf_to_pc_data, fh)

        return urdf_to_pc_data

    def add_noise_to_depth(self, depth_img):
        """ add depth noise """
        multiplicative_noise = np.random.gamma(self.gamma_shape, self.gamma_scale)
        depth_img = multiplicative_noise * depth_img
        return depth_img

    def add_noise_to_xyz(self, xyz_img, depth_img):
        """ TODO: remove this code or at least clean it up"""
        xyz_img = xyz_img.copy()
        H, W, C = xyz_img.shape
        gp_rescale_factor = np.random.randint(self.gp_rescale_factor_range[0],
                                              self.gp_rescale_factor_range[1])
        gp_scale = np.random.uniform(self.gaussian_scale_range[0],
                                     self.gaussian_scale_range[1])
        small_H, small_W = (np.array([H, W]) / gp_rescale_factor).astype(int)
        additive_noise = np.random.normal(loc=0.0, scale=gp_scale, size=(small_H, small_W, C))
        additive_noise = cv2.resize(additive_noise, (W, H), interpolation=cv2.INTER_CUBIC)
        xyz_img[depth_img > 0, :] += additive_noise[depth_img > 0, :]
        return xyz_img

    def _get_images(self, h5, idx, ee=True):
        if ee:
            RGB, DEPTH, SEG = "ee_rgb", "ee_depth", "ee_seg"
            DMIN, DMAX = "ee_depth_min", "ee_depth_max"
        else:
            RGB, DEPTH, SEG = "rgb", "depth", "seg"
            DMIN, DMAX = "depth_min", "depth_max"
        dmin = h5[DMIN][idx]
        dmax = h5[DMAX][idx]
        rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255.  # remove alpha
        depth1 = h5[DEPTH][idx] / 20000. * (dmax - dmin) + dmin
        seg1 = img.PNGToNumpy(h5[SEG][idx])

        valid1 = np.logical_and(depth1 > 0.1, depth1 < 2.)

        # proj_matrix = h5['proj_matrix'][()]
        camera = cam.get_camera_from_h5(h5)
        if self.data_augmentation:
            depth1 = self.add_noise_to_depth(depth1)

        xyz1 = cam.compute_xyz(depth1, camera)
        if self.data_augmentation:
            xyz1 = self.add_noise_to_xyz(xyz1, depth1)

        # Transform the point cloud
        # Here it is...
        # CAM_POSE = "ee_cam_pose" if ee else "cam_pose"
        CAM_POSE = "ee_camera_view" if ee else "camera_view"
        cam_pose = h5[CAM_POSE][idx]
        if ee:
            # ee_camera_view has 0s for x, y, z
            cam_pos = h5["ee_cam_pose"][:][:3, 3]
            cam_pose[:3, 3] = cam_pos

        # Get transformed point cloud
        h, w, d = xyz1.shape
        xyz1 = xyz1.reshape(h * w, -1)
        xyz1 = trimesh.transform_points(xyz1, cam_pose)
        xyz1 = xyz1.reshape(h, w, -1)

        scene1 = rgb1, depth1, seg1, valid1, xyz1

        return scene1

    def _get_ids(self, h5):
        """
        get object ids

        @param h5:
        @return:
        """
        ids = {}
        for k in h5.keys():
            if k.startswith("id_"):
                ids[k[3:]] = h5[k][()]
        return ids

    def get_obj_pc(self, h5, step_t, obj):
        scene = self._get_images(h5, step_t, ee=True)
        rgb, depth, seg, valid, xyz = scene

        # getting object point clouds
        ids = self._get_ids(h5)
        obj_mask = np.logical_and(seg == ids[obj], valid)
        if np.sum(obj_mask) <= 0:
            raise Exception
        ok, obj_xyz, obj_rgb, _ = get_pts(xyz, rgb, obj_mask, num_pts=self.num_pts, to_tensor=False)
        obj_pc_center = np.mean(obj_xyz, axis=0)
        obj_pose = h5[obj][step_t]

        obj_pc_pose = np.eye(4)
        obj_pc_pose[:3, 3] = obj_pc_center[:3]

        return obj_xyz, obj_rgb, obj_pc_pose, obj_pose

    def __len__(self):
        return len(self.data_idxs)

    def __getitem__(self, idx):
        collision_filename, collision_idx = self.data_idxs[idx]
        collision_data_dict = load_pairwise_collision_data(collision_filename)

        obj1_urdf = collision_data_dict["obj1_info"]["urdf"]
        obj2_urdf = collision_data_dict["obj2_info"]["urdf"]

        # TODO: find a better way to sample pc data?
        obj1_pc_data = np.random.choice(self.urdf_to_pc_data[obj1_urdf])
        obj2_pc_data = np.random.choice(self.urdf_to_pc_data[obj2_urdf])

        obj1_xyz, obj1_rgb, obj1_pc_pose, obj1_pose = self.get_obj_pc(h5py.File(obj1_pc_data["filename"], "r"), obj1_pc_data["step_t"], obj1_pc_data["obj"])
        obj2_xyz, obj2_rgb, obj2_pc_pose, obj2_pose = self.get_obj_pc(h5py.File(obj2_pc_data["filename"], "r"), obj2_pc_data["step_t"], obj2_pc_data["obj"])

        obj1_c_pose = collision_data_dict["obj1_poses"][collision_idx]
        obj2_c_pose = collision_data_dict["obj2_poses"][collision_idx]
        label = collision_data_dict["intersection_labels"][collision_idx]

        obj1_transform = obj1_c_pose @ np.linalg.inv(obj1_pose)
        obj2_transform = obj2_c_pose @ np.linalg.inv(obj2_pose)
        obj1_c_xyz = trimesh.transform_points(obj1_xyz, obj1_transform)
        obj2_c_xyz = trimesh.transform_points(obj2_xyz, obj2_transform)

        # if self.debug:
        #     show_pcs([obj1_c_xyz, obj2_c_xyz], [obj1_rgb, obj2_rgb], add_coordinate_frame=True)

        ###################################
        obj_xyzs = [obj1_c_xyz, obj2_c_xyz]
        shuffle(obj_xyzs)

        num_indicator = 2
        new_obj_xyzs = []
        for oi, obj_xyz in enumerate(obj_xyzs):
            obj_xyz = np.concatenate([obj_xyz, np.tile(np.eye(num_indicator)[oi], (obj_xyz.shape[0], 1))], axis=1)
            new_obj_xyzs.append(obj_xyz)
        scene_xyz = np.concatenate(new_obj_xyzs, axis=0)

        # subsampling and normalizing pc
        idx = np.random.randint(0, scene_xyz.shape[0], self.num_scene_pts)
        scene_xyz = scene_xyz[idx]
        if self.normalize_pc:
            scene_xyz[:, 0:3] = pc_normalize(scene_xyz[:, 0:3])

        if self.random_rotation:
            scene_xyz[:, 0:3] = trimesh.transform_points(scene_xyz[:, 0:3], tra.euler_matrix(0, 0, np.random.uniform(low=0, high=2 * np.pi)))

        ###################################
        scene_xyz = array_to_tensor(scene_xyz)
        # convert to torch data
        label = int(label)

        if self.debug:
            print("intersection:", label)
            show_pcs([scene_xyz[:, 0:3]], [np.tile(np.array([0, 1, 0], dtype=np.float), (scene_xyz.shape[0], 1))], add_coordinate_frame=True)

        datum = {
            "scene_xyz": scene_xyz,
            "label": torch.FloatTensor([label]),
        }
        return datum

    # @staticmethod
    # def collate_fn(data):
    #     """
    #     :param data:
    #     :return:
    #     """
    #
    #     batched_data_dict = {}
    #     for key in ["is_circle"]:
    #         batched_data_dict[key] = torch.cat([dict[key] for dict in data], dim=0)
    #     for key in ["scene_xyz"]:
    #         batched_data_dict[key] = torch.stack([dict[key] for dict in data], dim=0)
    #
    #     return batched_data_dict
    #
    # # def create_pair_xyzs_from_obj_xyzs(self, new_obj_xyzs, debug=False):
    # #
    # #     new_obj_xyzs = [xyz.cpu().numpy() for xyz in new_obj_xyzs]
    # #
    # #     # compute pairwise collision
    # #     scene_xyzs = []
    # #     obj_xyz_pair_idxs = list(itertools.combinations(range(len(new_obj_xyzs)), 2))
    # #
    # #     for obj_xyz_pair_idx in obj_xyz_pair_idxs:
    # #         obj_xyz_pair = [new_obj_xyzs[obj_xyz_pair_idx[0]], new_obj_xyzs[obj_xyz_pair_idx[1]]]
    # #         num_indicator = 2
    # #         obj_xyz_pair_ind = []
    # #         for oi, obj_xyz in enumerate(obj_xyz_pair):
    # #             obj_xyz = np.concatenate([obj_xyz, np.tile(np.eye(num_indicator)[oi], (obj_xyz.shape[0], 1))], axis=1)
    # #             obj_xyz_pair_ind.append(obj_xyz)
    # #         pair_scene_xyz = np.concatenate(obj_xyz_pair_ind, axis=0)
    # #
    # #         # subsampling and normalizing pc
    # #         rand_idx = np.random.randint(0, pair_scene_xyz.shape[0], self.num_scene_pts)
    # #         pair_scene_xyz = pair_scene_xyz[rand_idx]
    # #         if self.normalize_pc:
    # #             pair_scene_xyz[:, 0:3] = pc_normalize(pair_scene_xyz[:, 0:3])
    # #
    # #         scene_xyzs.append(array_to_tensor(pair_scene_xyz))
    # #
    # #     if debug:
    # #         for scene_xyz in scene_xyzs:
    # #             show_pcs([scene_xyz[:, 0:3]], [np.tile(np.array([0, 1, 0], dtype=np.float), (scene_xyz.shape[0], 1))],
    # #                      add_coordinate_frame=True)
    # #
    # #     return scene_xyzs


if __name__ == "__main__":
    dataset = PairwiseCollisionDataset(urdf_pc_idx_file="/home/weiyu/data_drive/StructDiffusion/pairwise_collision_data/urdf_pc_idx.pkl",
                                       collision_data_dir="/home/weiyu/data_drive/StructDiffusion/pairwise_collision_data",
                                       debug=False)

    for i in tqdm.tqdm(np.random.permutation(len(dataset))):
        # print(i)
        d = dataset[i]
        # print(d["label"])

    # dl = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=8)
    # for b in tqdm.tqdm(dl):
    #     pass
ADDED
@@ -0,0 +1,579 @@
|
import copy
import cv2
import h5py
import numpy as np
import os
import trimesh
import torch
from tqdm import tqdm
import json
import random

from torch.utils.data import DataLoader

# Local imports
from StructDiffusion.utils.rearrangement import show_pcs, get_pts, combine_and_sample_xyzs
from StructDiffusion.language.tokenizer import Tokenizer

import StructDiffusion.utils.brain2.camera as cam
import StructDiffusion.utils.brain2.image as img
import StructDiffusion.utils.transformations as tra


class SemanticArrangementDataset(torch.utils.data.Dataset):

    def __init__(self, data_roots, index_roots, split, tokenizer,
                 max_num_target_objects=11, max_num_distractor_objects=5,
                 max_num_shape_parameters=7, max_num_rearrange_features=1, max_num_anchor_features=3,
                 num_pts=1024,
                 use_virtual_structure_frame=True, ignore_distractor_objects=True, ignore_rgb=True,
                 filter_num_moved_objects_range=None, shuffle_object_index=False,
                 data_augmentation=True, debug=False, **kwargs):
        """

        Note: setting filter_num_moved_objects_range=[k, k] and max_num_objects=k will create no padding for target objs

        :param data_root:
        :param split: train, valid, or test
        :param shuffle_object_index: whether to shuffle the positions of target objects and other objects in the sequence
        :param debug:
        :param max_num_shape_parameters:
        :param max_num_objects:
        :param max_num_rearrange_features:
        :param max_num_anchor_features:
        :param num_pts:
        :param use_stored_arrangement_indices:
        :param kwargs:
        """

        self.use_virtual_structure_frame = use_virtual_structure_frame
        self.ignore_distractor_objects = ignore_distractor_objects
        self.ignore_rgb = ignore_rgb and not debug

        self.num_pts = num_pts
        self.debug = debug

        self.max_num_objects = max_num_target_objects
        self.max_num_other_objects = max_num_distractor_objects
        self.max_num_shape_parameters = max_num_shape_parameters
        self.max_num_rearrange_features = max_num_rearrange_features
        self.max_num_anchor_features = max_num_anchor_features
        self.shuffle_object_index = shuffle_object_index

        # used to tokenize the language part
        self.tokenizer = tokenizer

        # retrieve data
        self.data_roots = data_roots
        self.arrangement_data = []
        arrangement_steps = []
        for ddx in range(len(data_roots)):
            data_root = data_roots[ddx]
            index_root = index_roots[ddx]
            arrangement_indices_file = os.path.join(data_root, index_root, "{}_arrangement_indices_file_all.txt".format(split))
            if os.path.exists(arrangement_indices_file):
                with open(arrangement_indices_file, "r") as fh:
                    arrangement_steps.extend([(os.path.join(data_root, f[0]), f[1]) for f in eval(fh.readline().strip())])
            else:
                print("{} does not exist".format(arrangement_indices_file))
        # only keep the goal, ignore the intermediate steps
        for filename, step_t in arrangement_steps:
            if step_t == 0:
                if "data00026058" in filename or "data00011415" in filename or "data00026061" in filename or "data00700565" in filename:
                    continue
                self.arrangement_data.append((filename, step_t))
        # if specified, filter data
        if filter_num_moved_objects_range is not None:
            self.arrangement_data = self.filter_based_on_number_of_moved_objects(filter_num_moved_objects_range)
        print("{} valid sequences".format(len(self.arrangement_data)))

        # Data Aug
        self.data_augmentation = data_augmentation
        # additive noise
        self.gp_rescale_factor_range = [12, 20]
        self.gaussian_scale_range = [0., 0.003]
        # multiplicative noise
        self.gamma_shape = 1000.
        self.gamma_scale = 0.001

    def filter_based_on_number_of_moved_objects(self, filter_num_moved_objects_range):
        assert len(list(filter_num_moved_objects_range)) == 2
        min_num, max_num = filter_num_moved_objects_range
        print("Remove scenes that have less than {} or more than {} objects being moved".format(min_num, max_num))
        ok_data = []
        for filename, step_t in self.arrangement_data:
            h5 = h5py.File(filename, 'r')
            moved_objs = h5['moved_objs'][()].split(',')
            if min_num <= len(moved_objs) <= max_num:
                ok_data.append((filename, step_t))
        print("{} valid sequences left".format(len(ok_data)))
        return ok_data

    def get_data_idx(self, idx):
        # Create the datum to return
        file_idx = np.argmax(idx < self.file_to_count)
        data = h5py.File(self.data_files[file_idx], 'r')
        if file_idx > 0:
            # for lang2sym, idx is always 0
            idx = idx - self.file_to_count[file_idx - 1]
        return data, idx, file_idx

    def add_noise_to_depth(self, depth_img):
        """ add depth noise """
        multiplicative_noise = np.random.gamma(self.gamma_shape, self.gamma_scale)
        depth_img = multiplicative_noise * depth_img
        return depth_img

    def add_noise_to_xyz(self, xyz_img, depth_img):
        """ TODO: remove this code or at least clean it up"""
        xyz_img = xyz_img.copy()
        H, W, C = xyz_img.shape
        gp_rescale_factor = np.random.randint(self.gp_rescale_factor_range[0],
                                              self.gp_rescale_factor_range[1])
        gp_scale = np.random.uniform(self.gaussian_scale_range[0],
                                     self.gaussian_scale_range[1])
        small_H, small_W = (np.array([H, W]) / gp_rescale_factor).astype(int)
        additive_noise = np.random.normal(loc=0.0, scale=gp_scale, size=(small_H, small_W, C))
        additive_noise = cv2.resize(additive_noise, (W, H), interpolation=cv2.INTER_CUBIC)
        xyz_img[depth_img > 0, :] += additive_noise[depth_img > 0, :]
        return xyz_img

    def random_index(self):
        return self[np.random.randint(len(self))]

    def _get_rgb(self, h5, idx, ee=True):
        RGB = "ee_rgb" if ee else "rgb"
        rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255.  # remove alpha
        return rgb1

    def _get_depth(self, h5, idx, ee=True):
        DEPTH = "ee_depth" if ee else "depth"

    def _get_images(self, h5, idx, ee=True):
        if ee:
            RGB, DEPTH, SEG = "ee_rgb", "ee_depth", "ee_seg"
            DMIN, DMAX = "ee_depth_min", "ee_depth_max"
        else:
            RGB, DEPTH, SEG = "rgb", "depth", "seg"
            DMIN, DMAX = "depth_min", "depth_max"
        dmin = h5[DMIN][idx]
        dmax = h5[DMAX][idx]
        rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255.  # remove alpha
        depth1 = h5[DEPTH][idx] / 20000. * (dmax - dmin) + dmin
        seg1 = img.PNGToNumpy(h5[SEG][idx])

        valid1 = np.logical_and(depth1 > 0.1, depth1 < 2.)

        # proj_matrix = h5['proj_matrix'][()]
        camera = cam.get_camera_from_h5(h5)
        if self.data_augmentation:
            depth1 = self.add_noise_to_depth(depth1)

        xyz1 = cam.compute_xyz(depth1, camera)
        if self.data_augmentation:
            xyz1 = self.add_noise_to_xyz(xyz1, depth1)

        # Transform the point cloud
        # Here it is...
        # CAM_POSE = "ee_cam_pose" if ee else "cam_pose"
        CAM_POSE = "ee_camera_view" if ee else "camera_view"
        cam_pose = h5[CAM_POSE][idx]
        if ee:
            # ee_camera_view has 0s for x, y, z
            cam_pos = h5["ee_cam_pose"][:][:3, 3]
            cam_pose[:3, 3] = cam_pos

        # Get transformed point cloud
        h, w, d = xyz1.shape
        xyz1 = xyz1.reshape(h * w, -1)
        xyz1 = trimesh.transform_points(xyz1, cam_pose)
        xyz1 = xyz1.reshape(h, w, -1)

        scene1 = rgb1, depth1, seg1, valid1, xyz1

        return scene1

    def __len__(self):
        return len(self.arrangement_data)

    def _get_ids(self, h5):
        """
        get object ids

        @param h5:
        @return:
        """
        ids = {}
        for k in h5.keys():
            if k.startswith("id_"):
                ids[k[3:]] = h5[k][()]
        return ids

    def get_positive_ratio(self):
        num_pos = 0
        for d in self.arrangement_data:
            filename, step_t = d
            if step_t == 0:
                num_pos += 1
        return (len(self.arrangement_data) - num_pos) * 1.0 / num_pos

    def get_object_position_vocab_sizes(self):
        return self.tokenizer.get_object_position_vocab_sizes()

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def get_data_index(self, idx):
        filename = self.arrangement_data[idx]
        return filename

    def get_raw_data(self, idx, inference_mode=False, shuffle_object_index=False):
        """

        :param idx:
        :param inference_mode:
        :param shuffle_object_index: used to test different orders of objects
|
236 |
+
:return:
|
237 |
+
"""
|
238 |
+
|
239 |
+
filename, _ = self.arrangement_data[idx]
|
240 |
+
|
241 |
+
h5 = h5py.File(filename, 'r')
|
242 |
+
ids = self._get_ids(h5)
|
243 |
+
all_objs = sorted([o for o in ids.keys() if "object_" in o])
|
244 |
+
goal_specification = json.loads(str(np.array(h5["goal_specification"])))
|
245 |
+
num_rearrange_objs = len(goal_specification["rearrange"]["objects"])
|
246 |
+
num_other_objs = len(goal_specification["anchor"]["objects"] + goal_specification["distract"]["objects"])
|
247 |
+
assert len(all_objs) == num_rearrange_objs + num_other_objs, "{}, {}".format(len(all_objs), num_rearrange_objs + num_other_objs)
|
248 |
+
assert num_rearrange_objs <= self.max_num_objects
|
249 |
+
assert num_other_objs <= self.max_num_other_objects
|
250 |
+
|
251 |
+
# important: only using the last step
|
252 |
+
step_t = num_rearrange_objs
|
253 |
+
|
254 |
+
target_objs = all_objs[:num_rearrange_objs]
|
255 |
+
other_objs = all_objs[num_rearrange_objs:]
|
256 |
+
|
257 |
+
structure_parameters = goal_specification["shape"]
|
258 |
+
|
259 |
+
# Important: ensure the order is correct
|
260 |
+
if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line":
|
261 |
+
target_objs = target_objs[::-1]
|
262 |
+
elif structure_parameters["type"] == "tower" or structure_parameters["type"] == "dinner":
|
263 |
+
target_objs = target_objs
|
264 |
+
else:
|
265 |
+
raise KeyError("{} structure is not recognized".format(structure_parameters["type"]))
|
266 |
+
all_objs = target_objs + other_objs
|
267 |
+
|
268 |
+
###################################
|
269 |
+
# getting scene images and point clouds
|
270 |
+
scene = self._get_images(h5, step_t, ee=True)
|
271 |
+
rgb, depth, seg, valid, xyz = scene
|
272 |
+
if inference_mode:
|
273 |
+
initial_scene = scene
|
274 |
+
|
275 |
+
# getting object point clouds
|
276 |
+
obj_pcs = []
|
277 |
+
obj_pad_mask = []
|
278 |
+
current_pc_poses = []
|
279 |
+
other_obj_pcs = []
|
280 |
+
other_obj_pad_mask = []
|
281 |
+
for obj in all_objs:
|
282 |
+
obj_mask = np.logical_and(seg == ids[obj], valid)
|
283 |
+
if np.sum(obj_mask) <= 0:
|
284 |
+
raise Exception
|
285 |
+
ok, obj_xyz, obj_rgb, _ = get_pts(xyz, rgb, obj_mask, num_pts=self.num_pts)
|
286 |
+
if not ok:
|
287 |
+
raise Exception
|
288 |
+
|
289 |
+
if obj in target_objs:
|
290 |
+
if self.ignore_rgb:
|
291 |
+
obj_pcs.append(obj_xyz)
|
292 |
+
else:
|
293 |
+
obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1))
|
294 |
+
obj_pad_mask.append(0)
|
295 |
+
pc_pose = np.eye(4)
|
296 |
+
pc_pose[:3, 3] = torch.mean(obj_xyz, dim=0).numpy()
|
297 |
+
current_pc_poses.append(pc_pose)
|
298 |
+
elif obj in other_objs:
|
299 |
+
if self.ignore_rgb:
|
300 |
+
other_obj_pcs.append(obj_xyz)
|
301 |
+
else:
|
302 |
+
other_obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1))
|
303 |
+
other_obj_pad_mask.append(0)
|
304 |
+
else:
|
305 |
+
raise Exception
|
306 |
+
|
307 |
+
###################################
|
308 |
+
# computes goal positions for objects
|
309 |
+
# Important: because of the noises we added to point clouds, the rearranged point clouds will not be perfect
|
310 |
+
if self.use_virtual_structure_frame:
|
311 |
+
goal_structure_pose = tra.euler_matrix(structure_parameters["rotation"][0], structure_parameters["rotation"][1],
|
312 |
+
structure_parameters["rotation"][2])
|
313 |
+
goal_structure_pose[:3, 3] = [structure_parameters["position"][0], structure_parameters["position"][1],
|
314 |
+
structure_parameters["position"][2]]
|
315 |
+
goal_structure_pose_inv = np.linalg.inv(goal_structure_pose)
|
316 |
+
|
317 |
+
goal_obj_poses = []
|
318 |
+
current_obj_poses = []
|
319 |
+
goal_pc_poses = []
|
320 |
+
for obj, current_pc_pose in zip(target_objs, current_pc_poses):
|
321 |
+
goal_pose = h5[obj][0]
|
322 |
+
current_pose = h5[obj][step_t]
|
323 |
+
if inference_mode:
|
324 |
+
goal_obj_poses.append(goal_pose)
|
325 |
+
current_obj_poses.append(current_pose)
|
326 |
+
|
327 |
+
goal_pc_pose = goal_pose @ np.linalg.inv(current_pose) @ current_pc_pose
|
328 |
+
if self.use_virtual_structure_frame:
|
329 |
+
goal_pc_pose = goal_structure_pose_inv @ goal_pc_pose
|
330 |
+
goal_pc_poses.append(goal_pc_pose)
|
331 |
+
|
332 |
+
# transform current object point cloud to the goal point cloud in the world frame
|
333 |
+
if self.debug:
|
334 |
+
new_obj_pcs = [copy.deepcopy(pc.numpy()) for pc in obj_pcs]
|
335 |
+
for i, obj_pc in enumerate(new_obj_pcs):
|
336 |
+
|
337 |
+
current_pc_pose = current_pc_poses[i]
|
338 |
+
goal_pc_pose = goal_pc_poses[i]
|
339 |
+
if self.use_virtual_structure_frame:
|
340 |
+
goal_pc_pose = goal_structure_pose @ goal_pc_pose
|
341 |
+
print("current pc pose", current_pc_pose)
|
342 |
+
print("goal pc pose", goal_pc_pose)
|
343 |
+
|
344 |
+
goal_pc_transform = goal_pc_pose @ np.linalg.inv(current_pc_pose)
|
345 |
+
print("transform", goal_pc_transform)
|
346 |
+
new_obj_pc = copy.deepcopy(obj_pc)
|
347 |
+
new_obj_pc[:, :3] = trimesh.transform_points(obj_pc[:, :3], goal_pc_transform)
|
348 |
+
print(new_obj_pc.shape)
|
349 |
+
|
350 |
+
# visualize rearrangement sequence (new_obj_xyzs), the current object before moving (obj_xyz), and other objects
|
351 |
+
new_obj_pcs[i] = new_obj_pc
|
352 |
+
new_obj_pcs[i][:, 3:] = np.tile(np.array([1, 0, 0], dtype=float), (new_obj_pc.shape[0], 1))
|
353 |
+
new_obj_rgb_current = np.tile(np.array([0, 1, 0], dtype=float), (new_obj_pc.shape[0], 1))
|
354 |
+
show_pcs([pc[:, :3] for pc in new_obj_pcs] + [pc[:, :3] for pc in other_obj_pcs] + [obj_pc[:, :3]],
|
355 |
+
[pc[:, 3:] for pc in new_obj_pcs] + [pc[:, 3:] for pc in other_obj_pcs] + [new_obj_rgb_current],
|
356 |
+
add_coordinate_frame=True)
|
357 |
+
show_pcs([pc[:, :3] for pc in new_obj_pcs], [pc[:, 3:] for pc in new_obj_pcs], add_coordinate_frame=True)
|
358 |
+
|
359 |
+
# pad data
|
360 |
+
for i in range(self.max_num_objects - len(target_objs)):
|
361 |
+
obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32))
|
362 |
+
obj_pad_mask.append(1)
|
363 |
+
for i in range(self.max_num_other_objects - len(other_objs)):
|
364 |
+
other_obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32))
|
365 |
+
other_obj_pad_mask.append(1)
|
366 |
+
|
367 |
+
###################################
|
368 |
+
# preparing sentence
|
369 |
+
sentence = []
|
370 |
+
sentence_pad_mask = []
|
371 |
+
|
372 |
+
# structure parameters
|
373 |
+
# 5 parameters
|
374 |
+
structure_parameters = goal_specification["shape"]
|
375 |
+
if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line":
|
376 |
+
sentence.append((structure_parameters["type"], "shape"))
|
377 |
+
sentence.append((structure_parameters["rotation"][2], "rotation"))
|
378 |
+
sentence.append((structure_parameters["position"][0], "position_x"))
|
379 |
+
sentence.append((structure_parameters["position"][1], "position_y"))
|
380 |
+
if structure_parameters["type"] == "circle":
|
381 |
+
sentence.append((structure_parameters["radius"], "radius"))
|
382 |
+
elif structure_parameters["type"] == "line":
|
383 |
+
sentence.append((structure_parameters["length"] / 2.0, "radius"))
|
384 |
+
for _ in range(5):
|
385 |
+
sentence_pad_mask.append(0)
|
386 |
+
else:
|
387 |
+
sentence.append((structure_parameters["type"], "shape"))
|
388 |
+
sentence.append((structure_parameters["rotation"][2], "rotation"))
|
389 |
+
sentence.append((structure_parameters["position"][0], "position_x"))
|
390 |
+
sentence.append((structure_parameters["position"][1], "position_y"))
|
391 |
+
for _ in range(4):
|
392 |
+
sentence_pad_mask.append(0)
|
393 |
+
sentence.append(("PAD", None))
|
394 |
+
sentence_pad_mask.append(1)
|
395 |
+
|
396 |
+
###################################
|
397 |
+
# paddings
|
398 |
+
for i in range(self.max_num_objects - len(target_objs)):
|
399 |
+
goal_pc_poses.append(np.eye(4))
|
400 |
+
|
401 |
+
###################################
|
402 |
+
if self.debug:
|
403 |
+
print("---")
|
404 |
+
print("all objects:", all_objs)
|
405 |
+
print("target objects:", target_objs)
|
406 |
+
print("other objects:", other_objs)
|
407 |
+
print("goal specification:", goal_specification)
|
408 |
+
print("sentence:", sentence)
|
409 |
+
show_pcs([pc[:, :3] for pc in obj_pcs + other_obj_pcs], [pc[:, 3:] for pc in obj_pcs + other_obj_pcs], add_coordinate_frame=True)
|
410 |
+
|
411 |
+
assert len(obj_pcs) == len(goal_pc_poses)
|
412 |
+
###################################
|
413 |
+
|
414 |
+
# shuffle the position of objects
|
415 |
+
if shuffle_object_index:
|
416 |
+
shuffle_target_object_indices = list(range(len(target_objs)))
|
417 |
+
random.shuffle(shuffle_target_object_indices)
|
418 |
+
shuffle_object_indices = shuffle_target_object_indices + list(range(len(target_objs), self.max_num_objects))
|
419 |
+
obj_pcs = [obj_pcs[i] for i in shuffle_object_indices]
|
420 |
+
goal_pc_poses = [goal_pc_poses[i] for i in shuffle_object_indices]
|
421 |
+
if inference_mode:
|
422 |
+
goal_obj_poses = [goal_obj_poses[i] for i in shuffle_object_indices]
|
423 |
+
current_obj_poses = [current_obj_poses[i] for i in shuffle_object_indices]
|
424 |
+
target_objs = [target_objs[i] for i in shuffle_target_object_indices]
|
425 |
+
current_pc_poses = [current_pc_poses[i] for i in shuffle_object_indices]
|
426 |
+
|
427 |
+
###################################
|
428 |
+
if self.use_virtual_structure_frame:
|
429 |
+
if self.ignore_distractor_objects:
|
430 |
+
# language, structure virtual frame, target objects
|
431 |
+
pcs = obj_pcs
|
432 |
+
type_index = [0] * self.max_num_shape_parameters + [2] + [3] * self.max_num_objects
|
433 |
+
position_index = list(range(self.max_num_shape_parameters)) + [0] + list(range(self.max_num_objects))
|
434 |
+
pad_mask = sentence_pad_mask + [0] + obj_pad_mask
|
435 |
+
else:
|
436 |
+
# language, distractor objects, structure virtual frame, target objects
|
437 |
+
pcs = other_obj_pcs + obj_pcs
|
438 |
+
type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [2] + [3] * self.max_num_objects
|
439 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + [0] + list(range(self.max_num_objects))
|
440 |
+
pad_mask = sentence_pad_mask + other_obj_pad_mask + [0] + obj_pad_mask
|
441 |
+
goal_poses = [goal_structure_pose] + goal_pc_poses
|
442 |
+
else:
|
443 |
+
if self.ignore_distractor_objects:
|
444 |
+
# language, target objects
|
445 |
+
pcs = obj_pcs
|
446 |
+
type_index = [0] * self.max_num_shape_parameters + [3] * self.max_num_objects
|
447 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_objects))
|
448 |
+
pad_mask = sentence_pad_mask + obj_pad_mask
|
449 |
+
else:
|
450 |
+
# language, distractor objects, target objects
|
451 |
+
pcs = other_obj_pcs + obj_pcs
|
452 |
+
type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [3] * self.max_num_objects
|
453 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + list(range(self.max_num_objects))
|
454 |
+
pad_mask = sentence_pad_mask + other_obj_pad_mask + obj_pad_mask
|
455 |
+
goal_poses = goal_pc_poses
|
456 |
+
|
457 |
+
datum = {
|
458 |
+
"pcs": pcs,
|
459 |
+
"sentence": sentence,
|
460 |
+
"goal_poses": goal_poses,
|
461 |
+
"type_index": type_index,
|
462 |
+
"position_index": position_index,
|
463 |
+
"pad_mask": pad_mask,
|
464 |
+
"t": step_t,
|
465 |
+
"filename": filename
|
466 |
+
}
|
467 |
+
|
468 |
+
if inference_mode:
|
469 |
+
datum["rgb"] = rgb
|
470 |
+
datum["goal_obj_poses"] = goal_obj_poses
|
471 |
+
datum["current_obj_poses"] = current_obj_poses
|
472 |
+
datum["target_objs"] = target_objs
|
473 |
+
datum["initial_scene"] = initial_scene
|
474 |
+
datum["ids"] = ids
|
475 |
+
datum["goal_specification"] = goal_specification
|
476 |
+
datum["current_pc_poses"] = current_pc_poses
|
477 |
+
|
478 |
+
return datum
|
479 |
+
|
480 |
+
@staticmethod
|
481 |
+
def convert_to_tensors(datum, tokenizer):
|
482 |
+
tensors = {
|
483 |
+
"pcs": torch.stack(datum["pcs"], dim=0),
|
484 |
+
"sentence": torch.LongTensor(np.array([tokenizer.tokenize(*i) for i in datum["sentence"]])),
|
485 |
+
"goal_poses": torch.FloatTensor(np.array(datum["goal_poses"])),
|
486 |
+
"type_index": torch.LongTensor(np.array(datum["type_index"])),
|
487 |
+
"position_index": torch.LongTensor(np.array(datum["position_index"])),
|
488 |
+
"pad_mask": torch.LongTensor(np.array(datum["pad_mask"])),
|
489 |
+
"t": datum["t"],
|
490 |
+
"filename": datum["filename"]
|
491 |
+
}
|
492 |
+
return tensors
|
493 |
+
|
494 |
+
def __getitem__(self, idx):
|
495 |
+
|
496 |
+
datum = self.convert_to_tensors(self.get_raw_data(idx, shuffle_object_index=self.shuffle_object_index),
|
497 |
+
self.tokenizer)
|
498 |
+
|
499 |
+
return datum
|
500 |
+
|
501 |
+
def single_datum_to_batch(self, x, num_samples, device, inference_mode=True):
|
502 |
+
tensor_x = {}
|
503 |
+
|
504 |
+
tensor_x["pcs"] = x["pcs"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1)
|
505 |
+
tensor_x["sentence"] = x["sentence"].to(device)[None, :].repeat(num_samples, 1)
|
506 |
+
if not inference_mode:
|
507 |
+
tensor_x["goal_poses"] = x["goal_poses"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1)
|
508 |
+
|
509 |
+
tensor_x["type_index"] = x["type_index"].to(device)[None, :].repeat(num_samples, 1)
|
510 |
+
tensor_x["position_index"] = x["position_index"].to(device)[None, :].repeat(num_samples, 1)
|
511 |
+
tensor_x["pad_mask"] = x["pad_mask"].to(device)[None, :].repeat(num_samples, 1)
|
512 |
+
|
513 |
+
return tensor_x
|
514 |
+
|
515 |
+
|
516 |
+
def compute_min_max(dataloader):
|
517 |
+
|
518 |
+
# tensor([-0.3557, -0.3847, 0.0000, -1.0000, -1.0000, -0.4759, -1.0000, -1.0000,
|
519 |
+
# -0.9079, -0.8668, -0.9105, -0.4186])
|
520 |
+
# tensor([0.3915, 0.3494, 0.3267, 1.0000, 1.0000, 0.8961, 1.0000, 1.0000, 0.8194,
|
521 |
+
# 0.4787, 0.6421, 1.0000])
|
522 |
+
# tensor([0.0918, -0.3758, 0.0000, -1.0000, -1.0000, 0.0000, -1.0000, -1.0000,
|
523 |
+
# -0.0000, 0.0000, 0.0000, 1.0000])
|
524 |
+
# tensor([0.9199, 0.3710, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 1.0000, -0.0000,
|
525 |
+
# 0.0000, 0.0000, 1.0000])
|
526 |
+
|
527 |
+
min_value = torch.ones(16) * 10000
|
528 |
+
max_value = torch.ones(16) * -10000
|
529 |
+
for d in tqdm(dataloader):
|
530 |
+
goal_poses = d["goal_poses"]
|
531 |
+
goal_poses = goal_poses.reshape(-1, 16)
|
532 |
+
current_max, _ = torch.max(goal_poses, dim=0)
|
533 |
+
current_min, _ = torch.min(goal_poses, dim=0)
|
534 |
+
max_value[max_value < current_max] = current_max[max_value < current_max]
|
535 |
+
min_value[min_value > current_min] = current_min[min_value > current_min]
|
536 |
+
print(f"{min_value} - {max_value}")
|
537 |
+
|
538 |
+
|
539 |
+
if __name__ == "__main__":
|
540 |
+
|
541 |
+
tokenizer = Tokenizer("/home/weiyu/data_drive/data_new_objects/type_vocabs_coarse.json")
|
542 |
+
|
543 |
+
data_roots = []
|
544 |
+
index_roots = []
|
545 |
+
for shape, index in [("circle", "index_10k"), ("line", "index_10k"), ("stacking", "index_10k"), ("dinner", "index_10k")]:
|
546 |
+
data_roots.append("/home/weiyu/data_drive/data_new_objects/examples_{}_new_objects/result".format(shape))
|
547 |
+
index_roots.append(index)
|
548 |
+
|
549 |
+
dataset = SemanticArrangementDataset(data_roots=data_roots,
|
550 |
+
index_roots=index_roots,
|
551 |
+
split="valid", tokenizer=tokenizer,
|
552 |
+
max_num_target_objects=7,
|
553 |
+
max_num_distractor_objects=5,
|
554 |
+
max_num_shape_parameters=5,
|
555 |
+
max_num_rearrange_features=0,
|
556 |
+
max_num_anchor_features=0,
|
557 |
+
num_pts=1024,
|
558 |
+
use_virtual_structure_frame=True,
|
559 |
+
ignore_distractor_objects=True,
|
560 |
+
ignore_rgb=True,
|
561 |
+
filter_num_moved_objects_range=None, # [5, 5]
|
562 |
+
data_augmentation=False,
|
563 |
+
shuffle_object_index=False,
|
564 |
+
debug=False)
|
565 |
+
|
566 |
+
# print(len(dataset))
|
567 |
+
# for d in dataset:
|
568 |
+
# print("\n\n" + "="*100)
|
569 |
+
|
570 |
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8)
|
571 |
+
for i, d in enumerate(tqdm(dataloader)):
|
572 |
+
pass
|
573 |
+
# for k in d:
|
574 |
+
# if isinstance(d[k], torch.Tensor):
|
575 |
+
# print("--size", k, d[k].shape)
|
576 |
+
# for k in d:
|
577 |
+
# print(k, d[k])
|
578 |
+
#
|
579 |
+
# input("next?")
|
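The goal-pose computation in get_raw_data above composes three transforms: the recorded goal pose of the object, the inverse of its current pose, and the frame centered on the observed point-cloud centroid; with use_virtual_structure_frame the result is further expressed in the structure frame. A minimal numeric sketch of that composition (the matrices below are illustrative placeholders, not values from the dataset):

import numpy as np

# placeholder poses; in the dataset these come from h5[obj][0], h5[obj][step_t],
# and the centroid of the segmented object point cloud
goal_pose = np.eye(4)
goal_pose[:3, 3] = [0.50, 0.00, 0.10]
current_pose = np.eye(4)
current_pose[:3, 3] = [0.20, -0.10, 0.10]
current_pc_pose = np.eye(4)
current_pc_pose[:3, 3] = [0.21, -0.09, 0.12]
goal_structure_pose = np.eye(4)
goal_structure_pose[:3, 3] = [0.40, 0.00, 0.00]

# same composition as in get_raw_data: apply the relative transform that moves the
# object from its current pose to its goal pose to the point-cloud frame
goal_pc_pose = goal_pose @ np.linalg.inv(current_pose) @ current_pc_pose

# with use_virtual_structure_frame=True the pose is expressed relative to the structure frame
goal_pc_pose_in_struct = np.linalg.inv(goal_structure_pose) @ goal_pc_pose

print(goal_pc_pose[:3, 3])            # [0.51 0.01 0.12]
print(goal_pc_pose_in_struct[:3, 3])  # [0.11 0.01 0.12]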
src/StructDiffusion/data/semantic_arrangement_demo.py
ADDED
@@ -0,0 +1,563 @@
1 |
+
import copy
|
2 |
+
import cv2
|
3 |
+
import h5py
|
4 |
+
import numpy as np
|
5 |
+
import os
|
6 |
+
import trimesh
|
7 |
+
import torch
|
8 |
+
from tqdm import tqdm
|
9 |
+
import json
|
10 |
+
import random
|
11 |
+
|
12 |
+
from torch.utils.data import DataLoader
|
13 |
+
|
14 |
+
# Local imports
|
15 |
+
from StructDiffusion.utils.rearrangement import show_pcs, get_pts, combine_and_sample_xyzs
|
16 |
+
from StructDiffusion.language.tokenizer import Tokenizer
|
17 |
+
|
18 |
+
import StructDiffusion.utils.brain2.camera as cam
|
19 |
+
import StructDiffusion.utils.brain2.image as img
|
20 |
+
import StructDiffusion.utils.transformations as tra
|
21 |
+
|
22 |
+
|
23 |
+
class SemanticArrangementDataset(torch.utils.data.Dataset):
|
24 |
+
|
25 |
+
def __init__(self, data_root, tokenizer,
|
26 |
+
max_num_target_objects=11, max_num_distractor_objects=5,
|
27 |
+
max_num_shape_parameters=7, max_num_rearrange_features=1, max_num_anchor_features=3,
|
28 |
+
num_pts=1024,
|
29 |
+
use_virtual_structure_frame=True, ignore_distractor_objects=True, ignore_rgb=True,
|
30 |
+
filter_num_moved_objects_range=None, shuffle_object_index=False,
|
31 |
+
data_augmentation=True, debug=False, **kwargs):
|
32 |
+
"""
|
33 |
+
|
34 |
+
Note: setting filter_num_moved_objects_range=[k, k] and max_num_objects=k will create no padding for target objs
|
35 |
+
|
36 |
+
:param data_root:
|
37 |
+
:param split: train, valid, or test
|
38 |
+
:param shuffle_object_index: whether to shuffle the positions of target objects and other objects in the sequence
|
39 |
+
:param debug:
|
40 |
+
:param max_num_shape_parameters:
|
41 |
+
:param max_num_objects:
|
42 |
+
:param max_num_rearrange_features:
|
43 |
+
:param max_num_anchor_features:
|
44 |
+
:param num_pts:
|
45 |
+
:param use_stored_arrangement_indices:
|
46 |
+
:param kwargs:
|
47 |
+
"""
|
48 |
+
|
49 |
+
self.use_virtual_structure_frame = use_virtual_structure_frame
|
50 |
+
self.ignore_distractor_objects = ignore_distractor_objects
|
51 |
+
self.ignore_rgb = ignore_rgb and not debug
|
52 |
+
|
53 |
+
self.num_pts = num_pts
|
54 |
+
self.debug = debug
|
55 |
+
|
56 |
+
self.max_num_objects = max_num_target_objects
|
57 |
+
self.max_num_other_objects = max_num_distractor_objects
|
58 |
+
self.max_num_shape_parameters = max_num_shape_parameters
|
59 |
+
self.max_num_rearrange_features = max_num_rearrange_features
|
60 |
+
self.max_num_anchor_features = max_num_anchor_features
|
61 |
+
self.shuffle_object_index = shuffle_object_index
|
62 |
+
|
63 |
+
# used to tokenize the language part
|
64 |
+
self.tokenizer = tokenizer
|
65 |
+
|
66 |
+
# retrieve data
|
67 |
+
self.data_root = data_root
|
68 |
+
self.arrangement_data = []
|
69 |
+
for filename in os.listdir(data_root):
|
70 |
+
if ".h5" in filename:
|
71 |
+
self.arrangement_data.append((os.path.join(data_root, filename), 0))
|
72 |
+
print("{} valid sequences".format(len(self.arrangement_data)))
|
73 |
+
|
74 |
+
# Data Aug
|
75 |
+
self.data_augmentation = data_augmentation
|
76 |
+
# additive noise
|
77 |
+
self.gp_rescale_factor_range = [12, 20]
|
78 |
+
self.gaussian_scale_range = [0., 0.003]
|
79 |
+
# multiplicative noise
|
80 |
+
self.gamma_shape = 1000.
|
81 |
+
self.gamma_scale = 0.001
|
82 |
+
|
83 |
+
def filter_based_on_number_of_moved_objects(self, filter_num_moved_objects_range):
|
84 |
+
assert len(list(filter_num_moved_objects_range)) == 2
|
85 |
+
min_num, max_num = filter_num_moved_objects_range
|
86 |
+
print("Remove scenes that have less than {} or more than {} objects being moved".format(min_num, max_num))
|
87 |
+
ok_data = []
|
88 |
+
for filename, step_t in self.arrangement_data:
|
89 |
+
h5 = h5py.File(filename, 'r')
|
90 |
+
moved_objs = h5['moved_objs'][()].split(',')
|
91 |
+
if min_num <= len(moved_objs) <= max_num:
|
92 |
+
ok_data.append((filename, step_t))
|
93 |
+
print("{} valid sequences left".format(len(ok_data)))
|
94 |
+
return ok_data
|
95 |
+
|
96 |
+
def get_data_idx(self, idx):
|
97 |
+
# Create the datum to return
|
98 |
+
file_idx = np.argmax(idx < self.file_to_count)
|
99 |
+
data = h5py.File(self.data_files[file_idx], 'r')
|
100 |
+
if file_idx > 0:
|
101 |
+
# for lang2sym, idx is always 0
|
102 |
+
idx = idx - self.file_to_count[file_idx - 1]
|
103 |
+
return data, idx, file_idx
|
104 |
+
|
105 |
+
def add_noise_to_depth(self, depth_img):
|
106 |
+
""" add depth noise """
|
107 |
+
multiplicative_noise = np.random.gamma(self.gamma_shape, self.gamma_scale)
|
108 |
+
depth_img = multiplicative_noise * depth_img
|
109 |
+
return depth_img
|
110 |
+
|
111 |
+
def add_noise_to_xyz(self, xyz_img, depth_img):
|
112 |
+
""" TODO: remove this code or at least celean it up"""
|
113 |
+
xyz_img = xyz_img.copy()
|
114 |
+
H, W, C = xyz_img.shape
|
115 |
+
gp_rescale_factor = np.random.randint(self.gp_rescale_factor_range[0],
|
116 |
+
self.gp_rescale_factor_range[1])
|
117 |
+
gp_scale = np.random.uniform(self.gaussian_scale_range[0],
|
118 |
+
self.gaussian_scale_range[1])
|
119 |
+
small_H, small_W = (np.array([H, W]) / gp_rescale_factor).astype(int)
|
120 |
+
additive_noise = np.random.normal(loc=0.0, scale=gp_scale, size=(small_H, small_W, C))
|
121 |
+
additive_noise = cv2.resize(additive_noise, (W, H), interpolation=cv2.INTER_CUBIC)
|
122 |
+
xyz_img[depth_img > 0, :] += additive_noise[depth_img > 0, :]
|
123 |
+
return xyz_img
|
124 |
+
|
125 |
+
def random_index(self):
|
126 |
+
return self[np.random.randint(len(self))]
|
127 |
+
|
128 |
+
def _get_rgb(self, h5, idx, ee=True):
|
129 |
+
RGB = "ee_rgb" if ee else "rgb"
|
130 |
+
rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255. # remove alpha
|
131 |
+
return rgb1
|
132 |
+
|
133 |
+
def _get_depth(self, h5, idx, ee=True):
|
134 |
+
DEPTH = "ee_depth" if ee else "depth"
|
135 |
+
|
136 |
+
def _get_images(self, h5, idx, ee=True):
|
137 |
+
if ee:
|
138 |
+
RGB, DEPTH, SEG = "ee_rgb", "ee_depth", "ee_seg"
|
139 |
+
DMIN, DMAX = "ee_depth_min", "ee_depth_max"
|
140 |
+
else:
|
141 |
+
RGB, DEPTH, SEG = "rgb", "depth", "seg"
|
142 |
+
DMIN, DMAX = "depth_min", "depth_max"
|
143 |
+
dmin = h5[DMIN][idx]
|
144 |
+
dmax = h5[DMAX][idx]
|
145 |
+
rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255. # remove alpha
|
146 |
+
depth1 = h5[DEPTH][idx] / 20000. * (dmax - dmin) + dmin
|
147 |
+
seg1 = img.PNGToNumpy(h5[SEG][idx])
|
148 |
+
|
149 |
+
valid1 = np.logical_and(depth1 > 0.1, depth1 < 2.)
|
150 |
+
|
151 |
+
# proj_matrix = h5['proj_matrix'][()]
|
152 |
+
camera = cam.get_camera_from_h5(h5)
|
153 |
+
if self.data_augmentation:
|
154 |
+
depth1 = self.add_noise_to_depth(depth1)
|
155 |
+
|
156 |
+
xyz1 = cam.compute_xyz(depth1, camera)
|
157 |
+
if self.data_augmentation:
|
158 |
+
xyz1 = self.add_noise_to_xyz(xyz1, depth1)
|
159 |
+
|
160 |
+
# Transform the point cloud
|
161 |
+
# Here it is...
|
162 |
+
# CAM_POSE = "ee_cam_pose" if ee else "cam_pose"
|
163 |
+
CAM_POSE = "ee_camera_view" if ee else "camera_view"
|
164 |
+
cam_pose = h5[CAM_POSE][idx]
|
165 |
+
if ee:
|
166 |
+
# ee_camera_view has 0s for x, y, z
|
167 |
+
cam_pos = h5["ee_cam_pose"][:][:3, 3]
|
168 |
+
cam_pose[:3, 3] = cam_pos
|
169 |
+
|
170 |
+
# Get transformed point cloud
|
171 |
+
h, w, d = xyz1.shape
|
172 |
+
xyz1 = xyz1.reshape(h * w, -1)
|
173 |
+
xyz1 = trimesh.transform_points(xyz1, cam_pose)
|
174 |
+
xyz1 = xyz1.reshape(h, w, -1)
|
175 |
+
|
176 |
+
scene1 = rgb1, depth1, seg1, valid1, xyz1
|
177 |
+
|
178 |
+
return scene1
|
179 |
+
|
180 |
+
def __len__(self):
|
181 |
+
return len(self.arrangement_data)
|
182 |
+
|
183 |
+
def _get_ids(self, h5):
|
184 |
+
"""
|
185 |
+
get object ids
|
186 |
+
|
187 |
+
@param h5:
|
188 |
+
@return:
|
189 |
+
"""
|
190 |
+
ids = {}
|
191 |
+
for k in h5.keys():
|
192 |
+
if k.startswith("id_"):
|
193 |
+
ids[k[3:]] = h5[k][()]
|
194 |
+
return ids
|
195 |
+
|
196 |
+
def get_positive_ratio(self):
|
197 |
+
num_pos = 0
|
198 |
+
for d in self.arrangement_data:
|
199 |
+
filename, step_t = d
|
200 |
+
if step_t == 0:
|
201 |
+
num_pos += 1
|
202 |
+
return (len(self.arrangement_data) - num_pos) * 1.0 / num_pos
|
203 |
+
|
204 |
+
def get_object_position_vocab_sizes(self):
|
205 |
+
return self.tokenizer.get_object_position_vocab_sizes()
|
206 |
+
|
207 |
+
def get_vocab_size(self):
|
208 |
+
return self.tokenizer.get_vocab_size()
|
209 |
+
|
210 |
+
def get_data_index(self, idx):
|
211 |
+
filename = self.arrangement_data[idx]
|
212 |
+
return filename
|
213 |
+
|
214 |
+
def get_raw_data(self, idx, inference_mode=False, shuffle_object_index=False):
|
215 |
+
"""
|
216 |
+
|
217 |
+
:param idx:
|
218 |
+
:param inference_mode:
|
219 |
+
:param shuffle_object_index: used to test different orders of objects
|
220 |
+
:return:
|
221 |
+
"""
|
222 |
+
|
223 |
+
filename, _ = self.arrangement_data[idx]
|
224 |
+
|
225 |
+
h5 = h5py.File(filename, 'r')
|
226 |
+
ids = self._get_ids(h5)
|
227 |
+
all_objs = sorted([o for o in ids.keys() if "object_" in o])
|
228 |
+
goal_specification = json.loads(str(np.array(h5["goal_specification"])))
|
229 |
+
num_rearrange_objs = len(goal_specification["rearrange"]["objects"])
|
230 |
+
num_other_objs = len(goal_specification["anchor"]["objects"] + goal_specification["distract"]["objects"])
|
231 |
+
assert len(all_objs) == num_rearrange_objs + num_other_objs, "{}, {}".format(len(all_objs), num_rearrange_objs + num_other_objs)
|
232 |
+
assert num_rearrange_objs <= self.max_num_objects
|
233 |
+
assert num_other_objs <= self.max_num_other_objects
|
234 |
+
|
235 |
+
# important: only using the last step
|
236 |
+
step_t = num_rearrange_objs
|
237 |
+
|
238 |
+
target_objs = all_objs[:num_rearrange_objs]
|
239 |
+
other_objs = all_objs[num_rearrange_objs:]
|
240 |
+
|
241 |
+
structure_parameters = goal_specification["shape"]
|
242 |
+
|
243 |
+
# Important: ensure the order is correct
|
244 |
+
if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line":
|
245 |
+
target_objs = target_objs[::-1]
|
246 |
+
elif structure_parameters["type"] == "tower" or structure_parameters["type"] == "dinner":
|
247 |
+
target_objs = target_objs
|
248 |
+
else:
|
249 |
+
raise KeyError("{} structure is not recognized".format(structure_parameters["type"]))
|
250 |
+
all_objs = target_objs + other_objs
|
251 |
+
|
252 |
+
###################################
|
253 |
+
# getting scene images and point clouds
|
254 |
+
scene = self._get_images(h5, step_t, ee=True)
|
255 |
+
rgb, depth, seg, valid, xyz = scene
|
256 |
+
if inference_mode:
|
257 |
+
initial_scene = scene
|
258 |
+
|
259 |
+
# getting object point clouds
|
260 |
+
obj_pcs = []
|
261 |
+
obj_pad_mask = []
|
262 |
+
current_pc_poses = []
|
263 |
+
other_obj_pcs = []
|
264 |
+
other_obj_pad_mask = []
|
265 |
+
for obj in all_objs:
|
266 |
+
obj_mask = np.logical_and(seg == ids[obj], valid)
|
267 |
+
if np.sum(obj_mask) <= 0:
|
268 |
+
raise Exception
|
269 |
+
ok, obj_xyz, obj_rgb, _ = get_pts(xyz, rgb, obj_mask, num_pts=self.num_pts)
|
270 |
+
if not ok:
|
271 |
+
raise Exception
|
272 |
+
|
273 |
+
if obj in target_objs:
|
274 |
+
if self.ignore_rgb:
|
275 |
+
obj_pcs.append(obj_xyz)
|
276 |
+
else:
|
277 |
+
obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1))
|
278 |
+
obj_pad_mask.append(0)
|
279 |
+
pc_pose = np.eye(4)
|
280 |
+
pc_pose[:3, 3] = torch.mean(obj_xyz, dim=0).numpy()
|
281 |
+
current_pc_poses.append(pc_pose)
|
282 |
+
elif obj in other_objs:
|
283 |
+
if self.ignore_rgb:
|
284 |
+
other_obj_pcs.append(obj_xyz)
|
285 |
+
else:
|
286 |
+
other_obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1))
|
287 |
+
other_obj_pad_mask.append(0)
|
288 |
+
else:
|
289 |
+
raise Exception
|
290 |
+
|
291 |
+
###################################
|
292 |
+
# computes goal positions for objects
|
293 |
+
# Important: because of the noises we added to point clouds, the rearranged point clouds will not be perfect
|
294 |
+
if self.use_virtual_structure_frame:
|
295 |
+
goal_structure_pose = tra.euler_matrix(structure_parameters["rotation"][0], structure_parameters["rotation"][1],
|
296 |
+
structure_parameters["rotation"][2])
|
297 |
+
goal_structure_pose[:3, 3] = [structure_parameters["position"][0], structure_parameters["position"][1],
|
298 |
+
structure_parameters["position"][2]]
|
299 |
+
goal_structure_pose_inv = np.linalg.inv(goal_structure_pose)
|
300 |
+
|
301 |
+
goal_obj_poses = []
|
302 |
+
current_obj_poses = []
|
303 |
+
goal_pc_poses = []
|
304 |
+
for obj, current_pc_pose in zip(target_objs, current_pc_poses):
|
305 |
+
goal_pose = h5[obj][0]
|
306 |
+
current_pose = h5[obj][step_t]
|
307 |
+
if inference_mode:
|
308 |
+
goal_obj_poses.append(goal_pose)
|
309 |
+
current_obj_poses.append(current_pose)
|
310 |
+
|
311 |
+
goal_pc_pose = goal_pose @ np.linalg.inv(current_pose) @ current_pc_pose
|
312 |
+
if self.use_virtual_structure_frame:
|
313 |
+
goal_pc_pose = goal_structure_pose_inv @ goal_pc_pose
|
314 |
+
goal_pc_poses.append(goal_pc_pose)
|
315 |
+
|
316 |
+
# transform current object point cloud to the goal point cloud in the world frame
|
317 |
+
if self.debug:
|
318 |
+
new_obj_pcs = [copy.deepcopy(pc.numpy()) for pc in obj_pcs]
|
319 |
+
for i, obj_pc in enumerate(new_obj_pcs):
|
320 |
+
|
321 |
+
current_pc_pose = current_pc_poses[i]
|
322 |
+
goal_pc_pose = goal_pc_poses[i]
|
323 |
+
if self.use_virtual_structure_frame:
|
324 |
+
goal_pc_pose = goal_structure_pose @ goal_pc_pose
|
325 |
+
print("current pc pose", current_pc_pose)
|
326 |
+
print("goal pc pose", goal_pc_pose)
|
327 |
+
|
328 |
+
goal_pc_transform = goal_pc_pose @ np.linalg.inv(current_pc_pose)
|
329 |
+
print("transform", goal_pc_transform)
|
330 |
+
new_obj_pc = copy.deepcopy(obj_pc)
|
331 |
+
new_obj_pc[:, :3] = trimesh.transform_points(obj_pc[:, :3], goal_pc_transform)
|
332 |
+
print(new_obj_pc.shape)
|
333 |
+
|
334 |
+
# visualize rearrangement sequence (new_obj_xyzs), the current object before moving (obj_xyz), and other objects
|
335 |
+
new_obj_pcs[i] = new_obj_pc
|
336 |
+
new_obj_pcs[i][:, 3:] = np.tile(np.array([1, 0, 0], dtype=float), (new_obj_pc.shape[0], 1))
|
337 |
+
new_obj_rgb_current = np.tile(np.array([0, 1, 0], dtype=float), (new_obj_pc.shape[0], 1))
|
338 |
+
show_pcs([pc[:, :3] for pc in new_obj_pcs] + [pc[:, :3] for pc in other_obj_pcs] + [obj_pc[:, :3]],
|
339 |
+
[pc[:, 3:] for pc in new_obj_pcs] + [pc[:, 3:] for pc in other_obj_pcs] + [new_obj_rgb_current],
|
340 |
+
add_coordinate_frame=True)
|
341 |
+
show_pcs([pc[:, :3] for pc in new_obj_pcs], [pc[:, 3:] for pc in new_obj_pcs], add_coordinate_frame=True)
|
342 |
+
|
343 |
+
# pad data
|
344 |
+
for i in range(self.max_num_objects - len(target_objs)):
|
345 |
+
obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32))
|
346 |
+
obj_pad_mask.append(1)
|
347 |
+
for i in range(self.max_num_other_objects - len(other_objs)):
|
348 |
+
other_obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32))
|
349 |
+
other_obj_pad_mask.append(1)
|
350 |
+
|
351 |
+
###################################
|
352 |
+
# preparing sentence
|
353 |
+
sentence = []
|
354 |
+
sentence_pad_mask = []
|
355 |
+
|
356 |
+
# structure parameters
|
357 |
+
# 5 parameters
|
358 |
+
structure_parameters = goal_specification["shape"]
|
359 |
+
if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line":
|
360 |
+
sentence.append((structure_parameters["type"], "shape"))
|
361 |
+
sentence.append((structure_parameters["rotation"][2], "rotation"))
|
362 |
+
sentence.append((structure_parameters["position"][0], "position_x"))
|
363 |
+
sentence.append((structure_parameters["position"][1], "position_y"))
|
364 |
+
if structure_parameters["type"] == "circle":
|
365 |
+
sentence.append((structure_parameters["radius"], "radius"))
|
366 |
+
elif structure_parameters["type"] == "line":
|
367 |
+
sentence.append((structure_parameters["length"] / 2.0, "radius"))
|
368 |
+
for _ in range(5):
|
369 |
+
sentence_pad_mask.append(0)
|
370 |
+
else:
|
371 |
+
sentence.append((structure_parameters["type"], "shape"))
|
372 |
+
sentence.append((structure_parameters["rotation"][2], "rotation"))
|
373 |
+
sentence.append((structure_parameters["position"][0], "position_x"))
|
374 |
+
sentence.append((structure_parameters["position"][1], "position_y"))
|
375 |
+
for _ in range(4):
|
376 |
+
sentence_pad_mask.append(0)
|
377 |
+
sentence.append(("PAD", None))
|
378 |
+
sentence_pad_mask.append(1)
|
379 |
+
|
380 |
+
###################################
|
381 |
+
# paddings
|
382 |
+
for i in range(self.max_num_objects - len(target_objs)):
|
383 |
+
goal_pc_poses.append(np.eye(4))
|
384 |
+
|
385 |
+
###################################
|
386 |
+
if self.debug:
|
387 |
+
print("---")
|
388 |
+
print("all objects:", all_objs)
|
389 |
+
print("target objects:", target_objs)
|
390 |
+
print("other objects:", other_objs)
|
391 |
+
print("goal specification:", goal_specification)
|
392 |
+
print("sentence:", sentence)
|
393 |
+
show_pcs([pc[:, :3] for pc in obj_pcs + other_obj_pcs], [pc[:, 3:] for pc in obj_pcs + other_obj_pcs], add_coordinate_frame=True)
|
394 |
+
|
395 |
+
assert len(obj_pcs) == len(goal_pc_poses)
|
396 |
+
###################################
|
397 |
+
|
398 |
+
# shuffle the position of objects
|
399 |
+
if shuffle_object_index:
|
400 |
+
shuffle_target_object_indices = list(range(len(target_objs)))
|
401 |
+
random.shuffle(shuffle_target_object_indices)
|
402 |
+
shuffle_object_indices = shuffle_target_object_indices + list(range(len(target_objs), self.max_num_objects))
|
403 |
+
obj_pcs = [obj_pcs[i] for i in shuffle_object_indices]
|
404 |
+
goal_pc_poses = [goal_pc_poses[i] for i in shuffle_object_indices]
|
405 |
+
if inference_mode:
|
406 |
+
goal_obj_poses = [goal_obj_poses[i] for i in shuffle_object_indices]
|
407 |
+
current_obj_poses = [current_obj_poses[i] for i in shuffle_object_indices]
|
408 |
+
target_objs = [target_objs[i] for i in shuffle_target_object_indices]
|
409 |
+
current_pc_poses = [current_pc_poses[i] for i in shuffle_object_indices]
|
410 |
+
|
411 |
+
###################################
|
412 |
+
if self.use_virtual_structure_frame:
|
413 |
+
if self.ignore_distractor_objects:
|
414 |
+
# language, structure virtual frame, target objects
|
415 |
+
pcs = obj_pcs
|
416 |
+
type_index = [0] * self.max_num_shape_parameters + [2] + [3] * self.max_num_objects
|
417 |
+
position_index = list(range(self.max_num_shape_parameters)) + [0] + list(range(self.max_num_objects))
|
418 |
+
pad_mask = sentence_pad_mask + [0] + obj_pad_mask
|
419 |
+
else:
|
420 |
+
# language, distractor objects, structure virtual frame, target objects
|
421 |
+
pcs = other_obj_pcs + obj_pcs
|
422 |
+
type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [2] + [3] * self.max_num_objects
|
423 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + [0] + list(range(self.max_num_objects))
|
424 |
+
pad_mask = sentence_pad_mask + other_obj_pad_mask + [0] + obj_pad_mask
|
425 |
+
goal_poses = [goal_structure_pose] + goal_pc_poses
|
426 |
+
else:
|
427 |
+
if self.ignore_distractor_objects:
|
428 |
+
# language, target objects
|
429 |
+
pcs = obj_pcs
|
430 |
+
type_index = [0] * self.max_num_shape_parameters + [3] * self.max_num_objects
|
431 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_objects))
|
432 |
+
pad_mask = sentence_pad_mask + obj_pad_mask
|
433 |
+
else:
|
434 |
+
# language, distractor objects, target objects
|
435 |
+
pcs = other_obj_pcs + obj_pcs
|
436 |
+
type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [3] * self.max_num_objects
|
437 |
+
position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + list(range(self.max_num_objects))
|
438 |
+
pad_mask = sentence_pad_mask + other_obj_pad_mask + obj_pad_mask
|
439 |
+
goal_poses = goal_pc_poses
|
440 |
+
|
441 |
+
datum = {
|
442 |
+
"pcs": pcs,
|
443 |
+
"sentence": sentence,
|
444 |
+
"goal_poses": goal_poses,
|
445 |
+
"type_index": type_index,
|
446 |
+
"position_index": position_index,
|
447 |
+
"pad_mask": pad_mask,
|
448 |
+
"t": step_t,
|
449 |
+
"filename": filename
|
450 |
+
}
|
451 |
+
|
452 |
+
if inference_mode:
|
453 |
+
datum["rgb"] = rgb
|
454 |
+
datum["goal_obj_poses"] = goal_obj_poses
|
455 |
+
datum["current_obj_poses"] = current_obj_poses
|
456 |
+
datum["target_objs"] = target_objs
|
457 |
+
datum["initial_scene"] = initial_scene
|
458 |
+
datum["ids"] = ids
|
459 |
+
datum["goal_specification"] = goal_specification
|
460 |
+
datum["current_pc_poses"] = current_pc_poses
|
461 |
+
|
462 |
+
return datum
|
463 |
+
|
464 |
+
@staticmethod
|
465 |
+
def convert_to_tensors(datum, tokenizer):
|
466 |
+
tensors = {
|
467 |
+
"pcs": torch.stack(datum["pcs"], dim=0),
|
468 |
+
"sentence": torch.LongTensor(np.array([tokenizer.tokenize(*i) for i in datum["sentence"]])),
|
469 |
+
"goal_poses": torch.FloatTensor(np.array(datum["goal_poses"])),
|
470 |
+
"type_index": torch.LongTensor(np.array(datum["type_index"])),
|
471 |
+
"position_index": torch.LongTensor(np.array(datum["position_index"])),
|
472 |
+
"pad_mask": torch.LongTensor(np.array(datum["pad_mask"])),
|
473 |
+
"t": datum["t"],
|
474 |
+
"filename": datum["filename"]
|
475 |
+
}
|
476 |
+
return tensors
|
477 |
+
|
478 |
+
def __getitem__(self, idx):
|
479 |
+
|
480 |
+
datum = self.convert_to_tensors(self.get_raw_data(idx, shuffle_object_index=self.shuffle_object_index),
|
481 |
+
self.tokenizer)
|
482 |
+
|
483 |
+
return datum
|
484 |
+
|
485 |
+
def single_datum_to_batch(self, x, num_samples, device, inference_mode=True):
|
486 |
+
tensor_x = {}
|
487 |
+
|
488 |
+
tensor_x["pcs"] = x["pcs"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1)
|
489 |
+
tensor_x["sentence"] = x["sentence"].to(device)[None, :].repeat(num_samples, 1)
|
490 |
+
if not inference_mode:
|
491 |
+
tensor_x["goal_poses"] = x["goal_poses"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1)
|
492 |
+
|
493 |
+
tensor_x["type_index"] = x["type_index"].to(device)[None, :].repeat(num_samples, 1)
|
494 |
+
tensor_x["position_index"] = x["position_index"].to(device)[None, :].repeat(num_samples, 1)
|
495 |
+
tensor_x["pad_mask"] = x["pad_mask"].to(device)[None, :].repeat(num_samples, 1)
|
496 |
+
|
497 |
+
return tensor_x
|
498 |
+
|
499 |
+
|
500 |
+
def compute_min_max(dataloader):
|
501 |
+
|
502 |
+
# tensor([-0.3557, -0.3847, 0.0000, -1.0000, -1.0000, -0.4759, -1.0000, -1.0000,
|
503 |
+
# -0.9079, -0.8668, -0.9105, -0.4186])
|
504 |
+
# tensor([0.3915, 0.3494, 0.3267, 1.0000, 1.0000, 0.8961, 1.0000, 1.0000, 0.8194,
|
505 |
+
# 0.4787, 0.6421, 1.0000])
|
506 |
+
# tensor([0.0918, -0.3758, 0.0000, -1.0000, -1.0000, 0.0000, -1.0000, -1.0000,
|
507 |
+
# -0.0000, 0.0000, 0.0000, 1.0000])
|
508 |
+
# tensor([0.9199, 0.3710, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 1.0000, -0.0000,
|
509 |
+
# 0.0000, 0.0000, 1.0000])
|
510 |
+
|
511 |
+
min_value = torch.ones(16) * 10000
|
512 |
+
max_value = torch.ones(16) * -10000
|
513 |
+
for d in tqdm(dataloader):
|
514 |
+
goal_poses = d["goal_poses"]
|
515 |
+
goal_poses = goal_poses.reshape(-1, 16)
|
516 |
+
current_max, _ = torch.max(goal_poses, dim=0)
|
517 |
+
current_min, _ = torch.min(goal_poses, dim=0)
|
518 |
+
max_value[max_value < current_max] = current_max[max_value < current_max]
|
519 |
+
min_value[min_value > current_min] = current_min[min_value > current_min]
|
520 |
+
print(f"{min_value} - {max_value}")
|
521 |
+
|
522 |
+
|
523 |
+
if __name__ == "__main__":
|
524 |
+
|
525 |
+
tokenizer = Tokenizer("/home/weiyu/data_drive/data_new_objects/type_vocabs_coarse.json")
|
526 |
+
|
527 |
+
data_roots = []
|
528 |
+
index_roots = []
|
529 |
+
for shape, index in [("circle", "index_10k"), ("line", "index_10k"), ("stacking", "index_10k"), ("dinner", "index_10k")]:
|
530 |
+
data_roots.append("/home/weiyu/data_drive/data_new_objects/examples_{}_new_objects/result".format(shape))
|
531 |
+
index_roots.append(index)
|
532 |
+
|
533 |
+
dataset = SemanticArrangementDataset(data_roots=data_roots,
|
534 |
+
index_roots=index_roots,
|
535 |
+
split="valid", tokenizer=tokenizer,
|
536 |
+
max_num_target_objects=7,
|
537 |
+
max_num_distractor_objects=5,
|
538 |
+
max_num_shape_parameters=5,
|
539 |
+
max_num_rearrange_features=0,
|
540 |
+
max_num_anchor_features=0,
|
541 |
+
num_pts=1024,
|
542 |
+
use_virtual_structure_frame=True,
|
543 |
+
ignore_distractor_objects=True,
|
544 |
+
ignore_rgb=True,
|
545 |
+
filter_num_moved_objects_range=None, # [5, 5]
|
546 |
+
data_augmentation=False,
|
547 |
+
shuffle_object_index=False,
|
548 |
+
debug=False)
|
549 |
+
|
550 |
+
# print(len(dataset))
|
551 |
+
# for d in dataset:
|
552 |
+
# print("\n\n" + "="*100)
|
553 |
+
|
554 |
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8)
|
555 |
+
for i, d in enumerate(tqdm(dataloader)):
|
556 |
+
pass
|
557 |
+
# for k in d:
|
558 |
+
# if isinstance(d[k], torch.Tensor):
|
559 |
+
# print("--size", k, d[k].shape)
|
560 |
+
# for k in d:
|
561 |
+
# print(k, d[k])
|
562 |
+
#
|
563 |
+
# input("next?")
|
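A minimal usage sketch for the demo dataset above; the two paths are placeholders for wherever the demo .h5 files and the coarse type vocabulary live, and indexing the dataset runs get_raw_data followed by convert_to_tensors:

from StructDiffusion.language.tokenizer import Tokenizer
from StructDiffusion.data.semantic_arrangement_demo import SemanticArrangementDataset

# placeholder paths; point these at the demo h5 directory and the type vocab json
tokenizer = Tokenizer("/path/to/type_vocabs_coarse.json")
dataset = SemanticArrangementDataset(data_root="/path/to/demo_h5_dir",
                                     tokenizer=tokenizer,
                                     max_num_target_objects=7,
                                     max_num_distractor_objects=5,
                                     max_num_shape_parameters=5,
                                     num_pts=1024,
                                     use_virtual_structure_frame=True,
                                     ignore_distractor_objects=True,
                                     ignore_rgb=True,
                                     data_augmentation=False)

datum = dataset[0]                                  # tokenized tensors via convert_to_tensors
raw = dataset.get_raw_data(0, inference_mode=True)  # also returns poses, ids, and scene images
print(datum["pcs"].shape, datum["sentence"].shape, len(raw["target_objs"]))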
src/StructDiffusion/diffusion/__init__.py
ADDED
File without changes
src/StructDiffusion/diffusion/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (181 Bytes).
src/StructDiffusion/diffusion/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (185 Bytes).
src/StructDiffusion/diffusion/__pycache__/noise_schedule.cpython-37.pyc
ADDED
Binary file (2.57 kB).
src/StructDiffusion/diffusion/__pycache__/noise_schedule.cpython-38.pyc
ADDED
Binary file (2.57 kB).
src/StructDiffusion/diffusion/__pycache__/pose_conversion.cpython-37.pyc
ADDED
Binary file (2.25 kB).
src/StructDiffusion/diffusion/__pycache__/pose_conversion.cpython-38.pyc
ADDED
Binary file (2.27 kB).
src/StructDiffusion/diffusion/__pycache__/sampler.cpython-37.pyc
ADDED
Binary file (5.74 kB).
src/StructDiffusion/diffusion/__pycache__/sampler.cpython-38.pyc
ADDED
Binary file (5.71 kB).
src/StructDiffusion/diffusion/noise_schedule.py
ADDED
@@ -0,0 +1,81 @@
import math
import torch
import torch.nn.functional as F


def cosine_beta_schedule(timesteps, s=0.008):
    """
    cosine schedule as proposed in https://arxiv.org/abs/2102.09672
    """
    steps = timesteps + 1
    x = torch.linspace(0, timesteps, steps)
    alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return torch.clip(betas, 0.0001, 0.9999)


def linear_beta_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    return torch.linspace(beta_start, beta_end, timesteps)


def quadratic_beta_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    return torch.linspace(beta_start**0.5, beta_end**0.5, timesteps) ** 2


def sigmoid_beta_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    betas = torch.linspace(-6, 6, timesteps)
    return torch.sigmoid(betas) * (beta_end - beta_start) + beta_start


class NoiseSchedule:

    def __init__(self, timesteps=200):

        self.timesteps = timesteps

        # define beta schedule
        self.betas = linear_beta_schedule(timesteps=timesteps)
        # self.betas = cosine_beta_schedule(timesteps=timesteps)

        # define alphas
        self.alphas = 1. - self.betas
        # alphas_cumprod: alpha bar
        self.alphas_cumprod = torch.cumprod(self.alphas, axis=0)
        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)
        self.sqrt_recip_alphas = torch.sqrt(1.0 / self.alphas)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = self.betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)


def extract(a, t, x_shape):
    batch_size = t.shape[0]
    out = a.gather(-1, t.cpu())
    return out.reshape(batch_size, *((1,) * (len(x_shape) - 1))).to(t.device)


# forward diffusion (using the nice property)
def q_sample(x_start, t, noise_schedule, noise=None):
    if noise is None:
        noise = torch.randn_like(x_start)

    sqrt_alphas_cumprod_t = extract(noise_schedule.sqrt_alphas_cumprod, t, x_start.shape)
    # print("sqrt_alphas_cumprod_t", sqrt_alphas_cumprod_t)
    sqrt_one_minus_alphas_cumprod_t = extract(
        noise_schedule.sqrt_one_minus_alphas_cumprod, t, x_start.shape
    )
    # print("sqrt_one_minus_alphas_cumprod_t", sqrt_one_minus_alphas_cumprod_t)
    # print("noise", noise)

    return sqrt_alphas_cumprod_t * x_start + sqrt_one_minus_alphas_cumprod_t * noise
src/StructDiffusion/diffusion/pose_conversion.py
ADDED
@@ -0,0 +1,103 @@
import os
import torch
import pytorch3d.transforms as tra3d

from StructDiffusion.utils.rotation_continuity import compute_rotation_matrix_from_ortho6d


def get_diffusion_variables_from_9D_actions(struct_xyztheta_inputs, obj_xyztheta_inputs):

    # important: we need to get the first two columns, not first two rows
    # array([[ 3,  4,  5],
    #        [ 6,  7,  8],
    #        [ 9, 10, 11]])
    xyz_6d_idxs = [0, 1, 2, 3, 6, 9, 4, 7, 10]

    # print(batch_data["obj_xyztheta_inputs"].shape)
    # print(batch_data["struct_xyztheta_inputs"].shape)

    # only get the first and second columns of rotation
    obj_xyztheta_inputs = obj_xyztheta_inputs[:, :, xyz_6d_idxs]  # B, N, 9
    struct_xyztheta_inputs = struct_xyztheta_inputs[:, :, xyz_6d_idxs]  # B, 1, 9

    x = torch.cat([struct_xyztheta_inputs, obj_xyztheta_inputs], dim=1)  # B, 1 + N, 9

    # print(x.shape)

    return x


def get_diffusion_variables_from_H(poses):
    """
    [[0,1,2,3],
     [4,5,6,7],
     [8,9,10,11],
     [12,13,14,15]
    :param obj_xyztheta_inputs: B, N, 4, 4
    :return:
    """

    xyz_6d_idxs = [3, 7, 11, 0, 4, 8, 1, 5, 9]

    B, N, _, _ = poses.shape
    x = poses.reshape(B, N, 16)[:, :, xyz_6d_idxs]  # B, N, 9
    return x


def get_struct_objs_poses(x):

    on_gpu = x.is_cuda
    if not on_gpu:
        x = x.cuda()

    # assert x.is_cuda, "compute_rotation_matrix_from_ortho6d requires input to be on gpu"
    device = x.device

    # important: the noisy x can go out of bounds
    x = torch.clamp(x, min=-1, max=1)

    # x: B, 1 + N, 9
    B = x.shape[0]
    N = x.shape[1] - 1

    # compute_rotation_matrix_from_ortho6d takes in [B, 6], outputs [B, 3, 3]
    x_6d = x[:, :, 3:].reshape(-1, 6)
    x_rot = compute_rotation_matrix_from_ortho6d(x_6d).reshape(B, N+1, 3, 3)  # B, 1 + N, 3, 3

    x_trans = x[:, :, :3]  # B, 1 + N, 3

    x_full = torch.eye(4).repeat(B, 1 + N, 1, 1).to(device)
    x_full[:, :, :3, :3] = x_rot
    x_full[:, :, :3, 3] = x_trans

    struct_pose = x_full[:, 0].unsqueeze(1)  # B, 1, 4, 4
    pc_poses_in_struct = x_full[:, 1:]  # B, N, 4, 4

    if not on_gpu:
        struct_pose = struct_pose.cpu()
        pc_poses_in_struct = pc_poses_in_struct.cpu()

    return struct_pose, pc_poses_in_struct


def compute_current_and_goal_pc_poses(obj_xyzs, struct_pose, pc_poses_in_struct):

    device = obj_xyzs.device

    # obj_xyzs: B, N, P, 3
    # struct_pose: B, 1, 4, 4
    # pc_poses_in_struct: B, N, 4, 4
    B, N, _, _ = pc_poses_in_struct.shape
    _, _, P, _ = obj_xyzs.shape

    current_pc_poses = torch.eye(4).repeat(B, N, 1, 1).to(device)  # B, N, 4, 4
    # print(torch.mean(obj_xyzs, dim=2).shape)
    current_pc_poses[:, :, :3, 3] = torch.mean(obj_xyzs, dim=2)  # B, N, 4, 4

    struct_pose = struct_pose.repeat(1, N, 1, 1)  # B, N, 4, 4
    struct_pose = struct_pose.reshape(B * N, 4, 4)  # B x 1, 4, 4
    pc_poses_in_struct = pc_poses_in_struct.reshape(B * N, 4, 4)  # B x N, 4, 4

    goal_pc_poses = struct_pose @ pc_poses_in_struct  # B x N, 4, 4
    goal_pc_poses = goal_pc_poses.reshape(B, N, 4, 4)  # B, N, 4, 4
    return current_pc_poses, goal_pc_poses
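In the conversion above, each 4x4 pose becomes 9 numbers: the translation followed by the first two columns of the rotation matrix (the ortho6d parameterization). A small sketch of the forward direction, which runs on CPU; the inverse get_struct_objs_poses expects a GPU because compute_rotation_matrix_from_ortho6d does:

import torch
from StructDiffusion.diffusion.pose_conversion import get_diffusion_variables_from_H

# two scenes, each with a structure frame plus three objects, identity poses as placeholders
poses = torch.eye(4).repeat(2, 4, 1, 1)   # B=2, 1 + N=4, 4, 4
x = get_diffusion_variables_from_H(poses)
print(x.shape)   # torch.Size([2, 4, 9])
print(x[0, 0])   # tensor([0., 0., 0., 1., 0., 0., 0., 1., 0.]) -> xyz, then two rotation columns

# on a CUDA machine, get_struct_objs_poses(x) recovers (struct_pose, pc_poses_in_struct)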
src/StructDiffusion/diffusion/sampler.py
ADDED
@@ -0,0 +1,296 @@
+import torch
+from tqdm import tqdm
+import pytorch3d.transforms as tra3d
+
+from StructDiffusion.diffusion.noise_schedule import extract
+from StructDiffusion.diffusion.pose_conversion import get_struct_objs_poses
+from StructDiffusion.utils.batch_inference import move_pc_and_create_scene_simple, visualize_batch_pcs, move_pc_and_create_scene_new
+
+class Sampler:
+
+    def __init__(self, model_class, checkpoint_path, device, debug=False):
+
+        self.debug = debug
+        self.device = device
+
+        self.model = model_class.load_from_checkpoint(checkpoint_path)
+        self.backbone = self.model.model
+        self.backbone.to(device)
+        self.backbone.eval()
+
+    def sample(self, batch, num_poses):
+
+        noise_schedule = self.model.noise_schedule
+
+        B = batch["pcs"].shape[0]
+
+        x_noisy = torch.randn((B, num_poses, 9), device=self.device)
+
+        xs = []
+        for t_index in tqdm(reversed(range(0, noise_schedule.timesteps)),
+                            desc='sampling loop time step', total=noise_schedule.timesteps):
+
+            t = torch.full((B,), t_index, device=self.device, dtype=torch.long)
+
+            # noise schedule
+            betas_t = extract(noise_schedule.betas, t, x_noisy.shape)
+            sqrt_one_minus_alphas_cumprod_t = extract(noise_schedule.sqrt_one_minus_alphas_cumprod, t, x_noisy.shape)
+            sqrt_recip_alphas_t = extract(noise_schedule.sqrt_recip_alphas, t, x_noisy.shape)
+
+            # predict noise
+            pcs = batch["pcs"]
+            sentence = batch["sentence"]
+            type_index = batch["type_index"]
+            position_index = batch["position_index"]
+            pad_mask = batch["pad_mask"]
+            # calling the backbone instead of the pytorch-lightning model
+            with torch.no_grad():
+                predicted_noise = self.backbone.forward(t, pcs, sentence, x_noisy, type_index, position_index, pad_mask)
+
+            # compute noisy x at t
+            model_mean = sqrt_recip_alphas_t * (x_noisy - betas_t * predicted_noise / sqrt_one_minus_alphas_cumprod_t)
+            if t_index == 0:
+                x_noisy = model_mean
+            else:
+                posterior_variance_t = extract(noise_schedule.posterior_variance, t, x_noisy.shape)
+                noise = torch.randn_like(x_noisy)
+                x_noisy = model_mean + torch.sqrt(posterior_variance_t) * noise
+
+            xs.append(x_noisy)
+
+        xs = list(reversed(xs))
+        return xs
+
+class SamplerV2:
+
+    def __init__(self, diffusion_model_class, diffusion_checkpoint_path,
+                 collision_model_class, collision_checkpoint_path,
+                 device, debug=False):
+
+        self.debug = debug
+        self.device = device
+
+        self.diffusion_model = diffusion_model_class.load_from_checkpoint(diffusion_checkpoint_path)
+        self.diffusion_backbone = self.diffusion_model.model
+        self.diffusion_backbone.to(device)
+        self.diffusion_backbone.eval()
+
+        self.collision_model = collision_model_class.load_from_checkpoint(collision_checkpoint_path)
+        self.collision_backbone = self.collision_model.model
+        self.collision_backbone.to(device)
+        self.collision_backbone.eval()
+
+    def sample(self, batch, num_poses):
+
+        noise_schedule = self.diffusion_model.noise_schedule
+
+        B = batch["pcs"].shape[0]
+
+        x_noisy = torch.randn((B, num_poses, 9), device=self.device)
+
+        xs = []
+        for t_index in tqdm(reversed(range(0, noise_schedule.timesteps)),
+                            desc='sampling loop time step', total=noise_schedule.timesteps):
+
+            t = torch.full((B,), t_index, device=self.device, dtype=torch.long)
+
+            # noise schedule
+            betas_t = extract(noise_schedule.betas, t, x_noisy.shape)
+            sqrt_one_minus_alphas_cumprod_t = extract(noise_schedule.sqrt_one_minus_alphas_cumprod, t, x_noisy.shape)
+            sqrt_recip_alphas_t = extract(noise_schedule.sqrt_recip_alphas, t, x_noisy.shape)
+
+            # predict noise
+            pcs = batch["pcs"]
+            sentence = batch["sentence"]
+            type_index = batch["type_index"]
+            position_index = batch["position_index"]
+            pad_mask = batch["pad_mask"]
+            # calling the backbone instead of the pytorch-lightning model
+            with torch.no_grad():
+                predicted_noise = self.diffusion_backbone.forward(t, pcs, sentence, x_noisy, type_index, position_index, pad_mask)
+
+            # compute noisy x at t
+            model_mean = sqrt_recip_alphas_t * (x_noisy - betas_t * predicted_noise / sqrt_one_minus_alphas_cumprod_t)
+            if t_index == 0:
+                x_noisy = model_mean
+            else:
+                posterior_variance_t = extract(noise_schedule.posterior_variance, t, x_noisy.shape)
+                noise = torch.randn_like(x_noisy)
+                x_noisy = model_mean + torch.sqrt(posterior_variance_t) * noise
+
+            xs.append(x_noisy)
+
+        xs = list(reversed(xs))
+
+        visualize = True
+
+        struct_pose, pc_poses_in_struct = get_struct_objs_poses(xs[0])
+        # struct_pose: B, 1, 4, 4
+        # pc_poses_in_struct: B, N, 4, 4
+
+        S = B
+        num_elite = 10
+        ####################################################
+        # only keep one copy
+
+        # N, P, 3
+        obj_xyzs = batch["pcs"][0][:, :, :3]
+        print("obj_xyzs shape", obj_xyzs.shape)
+
+        # 1, N
+        # object_pad_mask: padding location has 1
+        num_target_objs = num_poses
+        if self.diffusion_backbone.use_virtual_structure_frame:
+            num_target_objs -= 1
+        object_pad_mask = batch["pad_mask"][0][-num_target_objs:].unsqueeze(0)
+        target_object_inds = 1 - object_pad_mask
+        print("target_object_inds shape", target_object_inds.shape)
+        print("target_object_inds", target_object_inds)
+
+        N, P, _ = obj_xyzs.shape
+        print("S, N, P: {}, {}, {}".format(S, N, P))
+
+        ####################################################
+        # S, N, ...
+
+        struct_pose = struct_pose.repeat(1, N, 1, 1)  # S, N, 4, 4
+        struct_pose = struct_pose.reshape(S * N, 4, 4)  # S x N, 4, 4
+
+        new_obj_xyzs = obj_xyzs.repeat(S, 1, 1, 1)  # S, N, P, 3
+        current_pc_pose = torch.eye(4).repeat(S, N, 1, 1).to(self.device)  # S, N, 4, 4
+        current_pc_pose[:, :, :3, 3] = torch.mean(new_obj_xyzs, dim=2)  # S, N, 4, 4
+        current_pc_pose = current_pc_pose.reshape(S * N, 4, 4)  # S x N, 4, 4
+
+        # optimize xyzrpy
+        obj_params = torch.zeros((S, N, 6)).to(self.device)
+        obj_params[:, :, :3] = pc_poses_in_struct[:, :, :3, 3]
+        obj_params[:, :, 3:] = tra3d.matrix_to_euler_angles(pc_poses_in_struct[:, :, :3, :3], "XYZ")  # S, N, 6
+        #
+        # new_obj_xyzs_before_cem, goal_pc_pose_before_cem = move_pc(obj_xyzs, obj_params, struct_pose, current_pc_pose, device)
+        #
+        # if visualize:
+        #     print("visualizing rearrangements predicted by the generator")
+        #     visualize_batch_pcs(new_obj_xyzs_before_cem, S, N, P, limit_B=5)
+
+        ####################################################
+        # rank
+
+        # evaluate in batches
+        scores = torch.zeros(S).to(self.device)
+        no_intersection_scores = torch.zeros(S).to(self.device)  # the higher the better
+        num_batches = int(S / B)
+        if S % B != 0:
+            num_batches += 1
+        for b in range(num_batches):
+            if b + 1 == num_batches:
+                cur_batch_idxs_start = b * B
+                cur_batch_idxs_end = S
+            else:
+                cur_batch_idxs_start = b * B
+                cur_batch_idxs_end = (b + 1) * B
+            cur_batch_size = cur_batch_idxs_end - cur_batch_idxs_start
+
+            # print("current batch idxs start", cur_batch_idxs_start)
+            # print("current batch idxs end", cur_batch_idxs_end)
+            # print("size of the current batch", cur_batch_size)
+
+            batch_obj_params = obj_params[cur_batch_idxs_start: cur_batch_idxs_end]
+            batch_struct_pose = struct_pose[cur_batch_idxs_start * N: cur_batch_idxs_end * N]
+            batch_current_pc_pose = current_pc_pose[cur_batch_idxs_start * N:cur_batch_idxs_end * N]
+
+            new_obj_xyzs, _, subsampled_scene_xyz, _, obj_pair_xyzs = \
+                move_pc_and_create_scene_new(obj_xyzs, batch_obj_params, batch_struct_pose, batch_current_pc_pose,
+                                             target_object_inds, self.device,
+                                             return_scene_pts=False,
+                                             return_scene_pts_and_pc_idxs=False,
+                                             num_scene_pts=False,
+                                             normalize_pc=False,
+                                             return_pair_pc=True,
+                                             num_pair_pc_pts=self.collision_model.data_cfg.num_scene_pts,
+                                             normalize_pair_pc=self.collision_model.data_cfg.normalize_pc)
+
+            #######################################
+            # predict whether there are pairwise collisions
+            # if collision_score_weight > 0:
+            with torch.no_grad():
+                _, num_comb, num_pair_pc_pts, _ = obj_pair_xyzs.shape
+                # obj_pair_xyzs = obj_pair_xyzs.reshape(cur_batch_size * num_comb, num_pair_pc_pts, -1)
+                collision_logits = self.collision_backbone.forward(obj_pair_xyzs.reshape(cur_batch_size * num_comb, num_pair_pc_pts, -1))
+                collision_scores = self.collision_backbone.convert_logits(collision_logits).reshape(cur_batch_size, num_comb)  # cur_batch_size, num_comb
+
+                # debug
+                # for bi, this_obj_pair_xyzs in enumerate(obj_pair_xyzs):
+                #     print("batch id", bi)
+                #     for pi, obj_pair_xyz in enumerate(this_obj_pair_xyzs):
+                #         print("pair", pi)
+                #         # obj_pair_xyzs: 2 * P, 5
+                #         print("collision score", collision_scores[bi, pi])
+                #         trimesh.PointCloud(obj_pair_xyz[:, :3].cpu()).show()
+
+                # 1 - mean() since the collision model predicts 1 if there is a collision
+                no_intersection_scores[cur_batch_idxs_start:cur_batch_idxs_end] = 1 - torch.mean(collision_scores, dim=1)
+            if visualize:
+                print("no intersection scores", no_intersection_scores)
+            # #######################################
+            # if discriminator_score_weight > 0:
+            #     # # debug:
+            #     # print(subsampled_scene_xyz.shape)
+            #     # print(subsampled_scene_xyz[0])
+            #     # trimesh.PointCloud(subsampled_scene_xyz[0, :, :3].cpu().numpy()).show()
+            #     #
+            #     with torch.no_grad():
+            #
+            #         # Important: since this discriminator only uses local structure param, takes sentence from the first and last position
+            #         # local_sentence = sentence[:, [0, 4]]
+            #         # local_sentence_pad_mask = sentence_pad_mask[:, [0, 4]]
+            #         # sentence_disc, sentence_pad_mask_disc, position_index_dic = discriminator_inference.dataset.tensorfy_sentence(raw_sentence_discriminator, raw_sentence_pad_mask_discriminator, raw_position_index_discriminator)
+            #
+            #         sentence_disc = torch.LongTensor(
+            #             [discriminator_tokenizer.tokenize(*i) for i in raw_sentence_discriminator])
+            #         sentence_pad_mask_disc = torch.LongTensor(raw_sentence_pad_mask_discriminator)
+            #         position_index_dic = torch.LongTensor(raw_position_index_discriminator)
+            #
+            #         preds = discriminator_model.forward(subsampled_scene_xyz,
+            #                                             sentence_disc.unsqueeze(0).repeat(cur_batch_size, 1).to(device),
+            #                                             sentence_pad_mask_disc.unsqueeze(0).repeat(cur_batch_size, 1).to(device),
+            #                                             position_index_dic.unsqueeze(0).repeat(cur_batch_size, 1).to(device))
+            #         # preds = discriminator_model.forward(subsampled_scene_xyz)
+            #         preds = discriminator_model.convert_logits(preds)
+            #         preds = preds["is_circle"]  # cur_batch_size,
+            #         scores[cur_batch_idxs_start:cur_batch_idxs_end] = preds
+            #         if visualize:
+            #             print("discriminator scores", scores)
+
+        # scores = scores * discriminator_score_weight + no_intersection_scores * collision_score_weight
+        scores = no_intersection_scores
+        sort_idx = torch.argsort(scores).flip(dims=[0])[:num_elite]
+        elite_obj_params = obj_params[sort_idx]  # num_elite, N, 6
+        elite_struct_poses = struct_pose.reshape(S, N, 4, 4)[sort_idx]  # num_elite, N, 4, 4
+        elite_struct_poses = elite_struct_poses.reshape(num_elite * N, 4, 4)  # num_elite x N, 4, 4
+        elite_scores = scores[sort_idx]
+        print("elite scores:", elite_scores)
+
+        ####################################################
+        # # visualize best samples
+        # num_scene_pts = 4096  # if discriminator_num_scene_pts is None else discriminator_num_scene_pts
+        # batch_current_pc_pose = current_pc_pose[0: num_elite * N]
+        # best_new_obj_xyzs, best_goal_pc_pose, best_subsampled_scene_xyz, _, _ = \
+        #     move_pc_and_create_scene_new(obj_xyzs, elite_obj_params, elite_struct_poses, batch_current_pc_pose,
+        #                                  target_object_inds, self.device,
+        #                                  return_scene_pts=True, num_scene_pts=num_scene_pts, normalize_pc=True)
+        # if visualize:
+        #     print("visualizing elite rearrangements ranked by collision model/discriminator")
+        #     visualize_batch_pcs(best_new_obj_xyzs, num_elite, limit_B=num_elite)
+
+        # num_elite, N, 6
+        elite_obj_params = elite_obj_params.reshape(num_elite * N, -1)
+        pc_poses_in_struct = torch.eye(4).repeat(num_elite * N, 1, 1).to(self.device)
+        pc_poses_in_struct[:, :3, :3] = tra3d.euler_angles_to_matrix(elite_obj_params[:, 3:], "XYZ")
+        pc_poses_in_struct[:, :3, 3] = elite_obj_params[:, :3]
+        pc_poses_in_struct = pc_poses_in_struct.reshape(num_elite, N, 4, 4)  # num_elite, N, 4, 4
+
+        struct_pose = elite_struct_poses.reshape(num_elite, N, 4, 4)[:, 0].unsqueeze(1)  # num_elite, 1, 4, 4
+
+        return struct_pose, pc_poses_in_struct
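Both sampling loops above implement the standard DDPM reverse update: the backbone's predicted noise gives the posterior mean, and Gaussian noise scaled by the posterior variance is added at every step except the last. The following stand-alone toy sketch reproduces just that update with a dummy noise predictor; the schedule values mirror what noise_schedule.py provides through extract, but all names and numbers here are illustrative, not part of this commit.

import torch

# linear beta schedule and the derived quantities used by the reverse update
timesteps = 200
betas = torch.linspace(1e-4, 0.02, timesteps)
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)
alphas_cumprod_prev = torch.cat([torch.ones(1), alphas_cumprod[:-1]])
sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
sqrt_recip_alphas = torch.sqrt(1.0 / alphas)
posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)

def predict_noise(x, t_index):
    # stand-in for the conditional transformer backbone
    return torch.zeros_like(x)

B, num_poses = 4, 8
x = torch.randn(B, num_poses, 9)  # structure frame + object poses in 9-D form
for t_index in reversed(range(timesteps)):
    eps = predict_noise(x, t_index)
    model_mean = sqrt_recip_alphas[t_index] * (x - betas[t_index] * eps / sqrt_one_minus_alphas_cumprod[t_index])
    if t_index == 0:
        x = model_mean
    else:
        x = model_mean + torch.sqrt(posterior_variance[t_index]) * torch.randn_like(x)
print(x.shape)  # torch.Size([4, 8, 9])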
src/StructDiffusion/language/__init__.py
ADDED
File without changes

src/StructDiffusion/language/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (180 Bytes)

src/StructDiffusion/language/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (184 Bytes)

src/StructDiffusion/language/__pycache__/tokenizer.cpython-37.pyc
ADDED
Binary file (11.4 kB)