import copy import cv2 import h5py import numpy as np import os import trimesh import torch from tqdm import tqdm import json import random from torch.utils.data import DataLoader # Local imports from StructDiffusion.utils.rearrangement import show_pcs, get_pts, combine_and_sample_xyzs from StructDiffusion.language.tokenizer import Tokenizer import StructDiffusion.utils.brain2.camera as cam import StructDiffusion.utils.brain2.image as img import StructDiffusion.utils.transformations as tra class SemanticArrangementDataset(torch.utils.data.Dataset): def __init__(self, data_roots, index_roots, split, tokenizer, max_num_target_objects=11, max_num_distractor_objects=5, max_num_shape_parameters=7, max_num_rearrange_features=1, max_num_anchor_features=3, num_pts=1024, use_virtual_structure_frame=True, ignore_distractor_objects=True, ignore_rgb=True, filter_num_moved_objects_range=None, shuffle_object_index=False, data_augmentation=True, debug=False, **kwargs): """ Note: setting filter_num_moved_objects_range=[k, k] and max_num_objects=k will create no padding for target objs :param data_root: :param split: train, valid, or test :param shuffle_object_index: whether to shuffle the positions of target objects and other objects in the sequence :param debug: :param max_num_shape_parameters: :param max_num_objects: :param max_num_rearrange_features: :param max_num_anchor_features: :param num_pts: :param use_stored_arrangement_indices: :param kwargs: """ self.use_virtual_structure_frame = use_virtual_structure_frame self.ignore_distractor_objects = ignore_distractor_objects self.ignore_rgb = ignore_rgb and not debug self.num_pts = num_pts self.debug = debug self.max_num_objects = max_num_target_objects self.max_num_other_objects = max_num_distractor_objects self.max_num_shape_parameters = max_num_shape_parameters self.max_num_rearrange_features = max_num_rearrange_features self.max_num_anchor_features = max_num_anchor_features self.shuffle_object_index = shuffle_object_index # used to tokenize the language part self.tokenizer = tokenizer # retrieve data self.data_roots = data_roots self.arrangement_data = [] arrangement_steps = [] for ddx in range(len(data_roots)): data_root = data_roots[ddx] index_root = index_roots[ddx] arrangement_indices_file = os.path.join(data_root, index_root, "{}_arrangement_indices_file_all.txt".format(split)) if os.path.exists(arrangement_indices_file): with open(arrangement_indices_file, "r") as fh: arrangement_steps.extend([(os.path.join(data_root, f[0]), f[1]) for f in eval(fh.readline().strip())]) else: print("{} does not exist".format(arrangement_indices_file)) # only keep the goal, ignore the intermediate steps for filename, step_t in arrangement_steps: if step_t == 0: if "data00026058" in filename or "data00011415" in filename or "data00026061" in filename or "data00700565" in filename: continue self.arrangement_data.append((filename, step_t)) # if specified, filter data if filter_num_moved_objects_range is not None: self.arrangement_data = self.filter_based_on_number_of_moved_objects(filter_num_moved_objects_range) print("{} valid sequences".format(len(self.arrangement_data))) # Data Aug self.data_augmentation = data_augmentation # additive noise self.gp_rescale_factor_range = [12, 20] self.gaussian_scale_range = [0., 0.003] # multiplicative noise self.gamma_shape = 1000. self.gamma_scale = 0.001 def filter_based_on_number_of_moved_objects(self, filter_num_moved_objects_range): assert len(list(filter_num_moved_objects_range)) == 2 min_num, max_num = filter_num_moved_objects_range print("Remove scenes that have less than {} or more than {} objects being moved".format(min_num, max_num)) ok_data = [] for filename, step_t in self.arrangement_data: h5 = h5py.File(filename, 'r') moved_objs = h5['moved_objs'][()].split(',') if min_num <= len(moved_objs) <= max_num: ok_data.append((filename, step_t)) print("{} valid sequences left".format(len(ok_data))) return ok_data def get_data_idx(self, idx): # Create the datum to return file_idx = np.argmax(idx < self.file_to_count) data = h5py.File(self.data_files[file_idx], 'r') if file_idx > 0: # for lang2sym, idx is always 0 idx = idx - self.file_to_count[file_idx - 1] return data, idx, file_idx def add_noise_to_depth(self, depth_img): """ add depth noise """ multiplicative_noise = np.random.gamma(self.gamma_shape, self.gamma_scale) depth_img = multiplicative_noise * depth_img return depth_img def add_noise_to_xyz(self, xyz_img, depth_img): """ TODO: remove this code or at least celean it up""" xyz_img = xyz_img.copy() H, W, C = xyz_img.shape gp_rescale_factor = np.random.randint(self.gp_rescale_factor_range[0], self.gp_rescale_factor_range[1]) gp_scale = np.random.uniform(self.gaussian_scale_range[0], self.gaussian_scale_range[1]) small_H, small_W = (np.array([H, W]) / gp_rescale_factor).astype(int) additive_noise = np.random.normal(loc=0.0, scale=gp_scale, size=(small_H, small_W, C)) additive_noise = cv2.resize(additive_noise, (W, H), interpolation=cv2.INTER_CUBIC) xyz_img[depth_img > 0, :] += additive_noise[depth_img > 0, :] return xyz_img def random_index(self): return self[np.random.randint(len(self))] def _get_rgb(self, h5, idx, ee=True): RGB = "ee_rgb" if ee else "rgb" rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255. # remove alpha return rgb1 def _get_depth(self, h5, idx, ee=True): DEPTH = "ee_depth" if ee else "depth" def _get_images(self, h5, idx, ee=True): if ee: RGB, DEPTH, SEG = "ee_rgb", "ee_depth", "ee_seg" DMIN, DMAX = "ee_depth_min", "ee_depth_max" else: RGB, DEPTH, SEG = "rgb", "depth", "seg" DMIN, DMAX = "depth_min", "depth_max" dmin = h5[DMIN][idx] dmax = h5[DMAX][idx] rgb1 = img.PNGToNumpy(h5[RGB][idx])[:, :, :3] / 255. # remove alpha depth1 = h5[DEPTH][idx] / 20000. * (dmax - dmin) + dmin seg1 = img.PNGToNumpy(h5[SEG][idx]) valid1 = np.logical_and(depth1 > 0.1, depth1 < 2.) # proj_matrix = h5['proj_matrix'][()] camera = cam.get_camera_from_h5(h5) if self.data_augmentation: depth1 = self.add_noise_to_depth(depth1) xyz1 = cam.compute_xyz(depth1, camera) if self.data_augmentation: xyz1 = self.add_noise_to_xyz(xyz1, depth1) # Transform the point cloud # Here it is... # CAM_POSE = "ee_cam_pose" if ee else "cam_pose" CAM_POSE = "ee_camera_view" if ee else "camera_view" cam_pose = h5[CAM_POSE][idx] if ee: # ee_camera_view has 0s for x, y, z cam_pos = h5["ee_cam_pose"][:][:3, 3] cam_pose[:3, 3] = cam_pos # Get transformed point cloud h, w, d = xyz1.shape xyz1 = xyz1.reshape(h * w, -1) xyz1 = trimesh.transform_points(xyz1, cam_pose) xyz1 = xyz1.reshape(h, w, -1) scene1 = rgb1, depth1, seg1, valid1, xyz1 return scene1 def __len__(self): return len(self.arrangement_data) def _get_ids(self, h5): """ get object ids @param h5: @return: """ ids = {} for k in h5.keys(): if k.startswith("id_"): ids[k[3:]] = h5[k][()] return ids def get_positive_ratio(self): num_pos = 0 for d in self.arrangement_data: filename, step_t = d if step_t == 0: num_pos += 1 return (len(self.arrangement_data) - num_pos) * 1.0 / num_pos def get_object_position_vocab_sizes(self): return self.tokenizer.get_object_position_vocab_sizes() def get_vocab_size(self): return self.tokenizer.get_vocab_size() def get_data_index(self, idx): filename = self.arrangement_data[idx] return filename def get_raw_data(self, idx, inference_mode=False, shuffle_object_index=False): """ :param idx: :param inference_mode: :param shuffle_object_index: used to test different orders of objects :return: """ filename, _ = self.arrangement_data[idx] h5 = h5py.File(filename, 'r') ids = self._get_ids(h5) all_objs = sorted([o for o in ids.keys() if "object_" in o]) goal_specification = json.loads(str(np.array(h5["goal_specification"]))) num_rearrange_objs = len(goal_specification["rearrange"]["objects"]) num_other_objs = len(goal_specification["anchor"]["objects"] + goal_specification["distract"]["objects"]) assert len(all_objs) == num_rearrange_objs + num_other_objs, "{}, {}".format(len(all_objs), num_rearrange_objs + num_other_objs) assert num_rearrange_objs <= self.max_num_objects assert num_other_objs <= self.max_num_other_objects # important: only using the last step step_t = num_rearrange_objs target_objs = all_objs[:num_rearrange_objs] other_objs = all_objs[num_rearrange_objs:] structure_parameters = goal_specification["shape"] # Important: ensure the order is correct if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line": target_objs = target_objs[::-1] elif structure_parameters["type"] == "tower" or structure_parameters["type"] == "dinner": target_objs = target_objs else: raise KeyError("{} structure is not recognized".format(structure_parameters["type"])) all_objs = target_objs + other_objs ################################### # getting scene images and point clouds scene = self._get_images(h5, step_t, ee=True) rgb, depth, seg, valid, xyz = scene if inference_mode: initial_scene = scene # getting object point clouds obj_pcs = [] obj_pad_mask = [] current_pc_poses = [] other_obj_pcs = [] other_obj_pad_mask = [] for obj in all_objs: obj_mask = np.logical_and(seg == ids[obj], valid) if np.sum(obj_mask) <= 0: raise Exception ok, obj_xyz, obj_rgb, _ = get_pts(xyz, rgb, obj_mask, num_pts=self.num_pts) if not ok: raise Exception if obj in target_objs: if self.ignore_rgb: obj_pcs.append(obj_xyz) else: obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1)) obj_pad_mask.append(0) pc_pose = np.eye(4) pc_pose[:3, 3] = torch.mean(obj_xyz, dim=0).numpy() current_pc_poses.append(pc_pose) elif obj in other_objs: if self.ignore_rgb: other_obj_pcs.append(obj_xyz) else: other_obj_pcs.append(torch.concat([obj_xyz, obj_rgb], dim=-1)) other_obj_pad_mask.append(0) else: raise Exception ################################### # computes goal positions for objects # Important: because of the noises we added to point clouds, the rearranged point clouds will not be perfect if self.use_virtual_structure_frame: goal_structure_pose = tra.euler_matrix(structure_parameters["rotation"][0], structure_parameters["rotation"][1], structure_parameters["rotation"][2]) goal_structure_pose[:3, 3] = [structure_parameters["position"][0], structure_parameters["position"][1], structure_parameters["position"][2]] goal_structure_pose_inv = np.linalg.inv(goal_structure_pose) goal_obj_poses = [] current_obj_poses = [] goal_pc_poses = [] for obj, current_pc_pose in zip(target_objs, current_pc_poses): goal_pose = h5[obj][0] current_pose = h5[obj][step_t] if inference_mode: goal_obj_poses.append(goal_pose) current_obj_poses.append(current_pose) goal_pc_pose = goal_pose @ np.linalg.inv(current_pose) @ current_pc_pose if self.use_virtual_structure_frame: goal_pc_pose = goal_structure_pose_inv @ goal_pc_pose goal_pc_poses.append(goal_pc_pose) # transform current object point cloud to the goal point cloud in the world frame if self.debug: new_obj_pcs = [copy.deepcopy(pc.numpy()) for pc in obj_pcs] for i, obj_pc in enumerate(new_obj_pcs): current_pc_pose = current_pc_poses[i] goal_pc_pose = goal_pc_poses[i] if self.use_virtual_structure_frame: goal_pc_pose = goal_structure_pose @ goal_pc_pose print("current pc pose", current_pc_pose) print("goal pc pose", goal_pc_pose) goal_pc_transform = goal_pc_pose @ np.linalg.inv(current_pc_pose) print("transform", goal_pc_transform) new_obj_pc = copy.deepcopy(obj_pc) new_obj_pc[:, :3] = trimesh.transform_points(obj_pc[:, :3], goal_pc_transform) print(new_obj_pc.shape) # visualize rearrangement sequence (new_obj_xyzs), the current object before moving (obj_xyz), and other objects new_obj_pcs[i] = new_obj_pc new_obj_pcs[i][:, 3:] = np.tile(np.array([1, 0, 0], dtype=np.float), (new_obj_pc.shape[0], 1)) new_obj_rgb_current = np.tile(np.array([0, 1, 0], dtype=np.float), (new_obj_pc.shape[0], 1)) show_pcs([pc[:, :3] for pc in new_obj_pcs] + [pc[:, :3] for pc in other_obj_pcs] + [obj_pc[:, :3]], [pc[:, 3:] for pc in new_obj_pcs] + [pc[:, 3:] for pc in other_obj_pcs] + [new_obj_rgb_current], add_coordinate_frame=True) show_pcs([pc[:, :3] for pc in new_obj_pcs], [pc[:, 3:] for pc in new_obj_pcs], add_coordinate_frame=True) # pad data for i in range(self.max_num_objects - len(target_objs)): obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32)) obj_pad_mask.append(1) for i in range(self.max_num_other_objects - len(other_objs)): other_obj_pcs.append(torch.zeros_like(obj_pcs[0], dtype=torch.float32)) other_obj_pad_mask.append(1) ################################### # preparing sentence sentence = [] sentence_pad_mask = [] # structure parameters # 5 parameters structure_parameters = goal_specification["shape"] if structure_parameters["type"] == "circle" or structure_parameters["type"] == "line": sentence.append((structure_parameters["type"], "shape")) sentence.append((structure_parameters["rotation"][2], "rotation")) sentence.append((structure_parameters["position"][0], "position_x")) sentence.append((structure_parameters["position"][1], "position_y")) if structure_parameters["type"] == "circle": sentence.append((structure_parameters["radius"], "radius")) elif structure_parameters["type"] == "line": sentence.append((structure_parameters["length"] / 2.0, "radius")) for _ in range(5): sentence_pad_mask.append(0) else: sentence.append((structure_parameters["type"], "shape")) sentence.append((structure_parameters["rotation"][2], "rotation")) sentence.append((structure_parameters["position"][0], "position_x")) sentence.append((structure_parameters["position"][1], "position_y")) for _ in range(4): sentence_pad_mask.append(0) sentence.append(("PAD", None)) sentence_pad_mask.append(1) ################################### # paddings for i in range(self.max_num_objects - len(target_objs)): goal_pc_poses.append(np.eye(4)) ################################### if self.debug: print("---") print("all objects:", all_objs) print("target objects:", target_objs) print("other objects:", other_objs) print("goal specification:", goal_specification) print("sentence:", sentence) show_pcs([pc[:, :3] for pc in obj_pcs + other_obj_pcs], [pc[:, 3:] for pc in obj_pcs + other_obj_pcs], add_coordinate_frame=True) assert len(obj_pcs) == len(goal_pc_poses) ################################### # shuffle the position of objects if shuffle_object_index: shuffle_target_object_indices = list(range(len(target_objs))) random.shuffle(shuffle_target_object_indices) shuffle_object_indices = shuffle_target_object_indices + list(range(len(target_objs), self.max_num_objects)) obj_pcs = [obj_pcs[i] for i in shuffle_object_indices] goal_pc_poses = [goal_pc_poses[i] for i in shuffle_object_indices] if inference_mode: goal_obj_poses = [goal_obj_poses[i] for i in shuffle_object_indices] current_obj_poses = [current_obj_poses[i] for i in shuffle_object_indices] target_objs = [target_objs[i] for i in shuffle_target_object_indices] current_pc_poses = [current_pc_poses[i] for i in shuffle_object_indices] ################################### if self.use_virtual_structure_frame: if self.ignore_distractor_objects: # language, structure virtual frame, target objects pcs = obj_pcs type_index = [0] * self.max_num_shape_parameters + [2] + [3] * self.max_num_objects position_index = list(range(self.max_num_shape_parameters)) + [0] + list(range(self.max_num_objects)) pad_mask = sentence_pad_mask + [0] + obj_pad_mask else: # language, distractor objects, structure virtual frame, target objects pcs = other_obj_pcs + obj_pcs type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [2] + [3] * self.max_num_objects position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + [0] + list(range(self.max_num_objects)) pad_mask = sentence_pad_mask + other_obj_pad_mask + [0] + obj_pad_mask goal_poses = [goal_structure_pose] + goal_pc_poses else: if self.ignore_distractor_objects: # language, target objects pcs = obj_pcs type_index = [0] * self.max_num_shape_parameters + [3] * self.max_num_objects position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_objects)) pad_mask = sentence_pad_mask + obj_pad_mask else: # language, distractor objects, target objects pcs = other_obj_pcs + obj_pcs type_index = [0] * self.max_num_shape_parameters + [1] * self.max_num_other_objects + [3] * self.max_num_objects position_index = list(range(self.max_num_shape_parameters)) + list(range(self.max_num_other_objects)) + list(range(self.max_num_objects)) pad_mask = sentence_pad_mask + other_obj_pad_mask + obj_pad_mask goal_poses = goal_pc_poses datum = { "pcs": pcs, "sentence": sentence, "goal_poses": goal_poses, "type_index": type_index, "position_index": position_index, "pad_mask": pad_mask, "t": step_t, "filename": filename } if inference_mode: datum["rgb"] = rgb datum["goal_obj_poses"] = goal_obj_poses datum["current_obj_poses"] = current_obj_poses datum["target_objs"] = target_objs datum["initial_scene"] = initial_scene datum["ids"] = ids datum["goal_specification"] = goal_specification datum["current_pc_poses"] = current_pc_poses return datum @staticmethod def convert_to_tensors(datum, tokenizer): tensors = { "pcs": torch.stack(datum["pcs"], dim=0), "sentence": torch.LongTensor(np.array([tokenizer.tokenize(*i) for i in datum["sentence"]])), "goal_poses": torch.FloatTensor(np.array(datum["goal_poses"])), "type_index": torch.LongTensor(np.array(datum["type_index"])), "position_index": torch.LongTensor(np.array(datum["position_index"])), "pad_mask": torch.LongTensor(np.array(datum["pad_mask"])), "t": datum["t"], "filename": datum["filename"] } return tensors def __getitem__(self, idx): datum = self.convert_to_tensors(self.get_raw_data(idx, shuffle_object_index=self.shuffle_object_index), self.tokenizer) return datum def single_datum_to_batch(self, x, num_samples, device, inference_mode=True): tensor_x = {} tensor_x["pcs"] = x["pcs"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1) tensor_x["sentence"] = x["sentence"].to(device)[None, :].repeat(num_samples, 1) if not inference_mode: tensor_x["goal_poses"] = x["goal_poses"].to(device)[None, :, :, :].repeat(num_samples, 1, 1, 1) tensor_x["type_index"] = x["type_index"].to(device)[None, :].repeat(num_samples, 1) tensor_x["position_index"] = x["position_index"].to(device)[None, :].repeat(num_samples, 1) tensor_x["pad_mask"] = x["pad_mask"].to(device)[None, :].repeat(num_samples, 1) return tensor_x def compute_min_max(dataloader): # tensor([-0.3557, -0.3847, 0.0000, -1.0000, -1.0000, -0.4759, -1.0000, -1.0000, # -0.9079, -0.8668, -0.9105, -0.4186]) # tensor([0.3915, 0.3494, 0.3267, 1.0000, 1.0000, 0.8961, 1.0000, 1.0000, 0.8194, # 0.4787, 0.6421, 1.0000]) # tensor([0.0918, -0.3758, 0.0000, -1.0000, -1.0000, 0.0000, -1.0000, -1.0000, # -0.0000, 0.0000, 0.0000, 1.0000]) # tensor([0.9199, 0.3710, 0.0000, 1.0000, 1.0000, 0.0000, 1.0000, 1.0000, -0.0000, # 0.0000, 0.0000, 1.0000]) min_value = torch.ones(16) * 10000 max_value = torch.ones(16) * -10000 for d in tqdm(dataloader): goal_poses = d["goal_poses"] goal_poses = goal_poses.reshape(-1, 16) current_max, _ = torch.max(goal_poses, dim=0) current_min, _ = torch.min(goal_poses, dim=0) max_value[max_value < current_max] = current_max[max_value < current_max] max_value[max_value > current_min] = current_min[max_value > current_min] print(f"{min_value} - {max_value}") if __name__ == "__main__": tokenizer = Tokenizer("/home/weiyu/data_drive/data_new_objects/type_vocabs_coarse.json") data_roots = [] index_roots = [] for shape, index in [("circle", "index_10k"), ("line", "index_10k"), ("stacking", "index_10k"), ("dinner", "index_10k")]: data_roots.append("/home/weiyu/data_drive/data_new_objects/examples_{}_new_objects/result".format(shape)) index_roots.append(index) dataset = SemanticArrangementDataset(data_roots=data_roots, index_roots=index_roots, split="valid", tokenizer=tokenizer, max_num_target_objects=7, max_num_distractor_objects=5, max_num_shape_parameters=5, max_num_rearrange_features=0, max_num_anchor_features=0, num_pts=1024, use_virtual_structure_frame=True, ignore_distractor_objects=True, ignore_rgb=True, filter_num_moved_objects_range=None, # [5, 5] data_augmentation=False, shuffle_object_index=False, debug=False) # print(len(dataset)) # for d in dataset: # print("\n\n" + "="*100) dataloader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8) for i, d in enumerate(tqdm(dataloader)): pass # for k in d: # if isinstance(d[k], torch.Tensor): # print("--size", k, d[k].shape) # for k in d: # print(k, d[k]) # # input("next?")