import os

import cv2
import numpy as np
import torch

from .base_data import BaseDataset
from .behave_paths import DataPaths
from .img_utils import compute_translation, masks2bbox, crop


def padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio=0.75):
    """
    pad images to have 4:3 aspect ratio
    :param rgb: (H, W, 3)
    :param person_mask: (H, W) or None
    :param obj_mask: (H, W) or None
    :return: all images at the given aspect ratio
    """
    h, w = rgb.shape[:2]
    if w > h * 1 / aspect_ratio:
        # pad top
        h_4x3 = int(w * aspect_ratio)
        pad_top = h_4x3 - h
        rgb_pad = np.pad(rgb, ((pad_top, 0), (0, 0), (0, 0)))
        person_mask = np.pad(person_mask, ((pad_top, 0), (0, 0))) if person_mask is not None else None
        obj_mask = np.pad(obj_mask, ((pad_top, 0), (0, 0))) if obj_mask is not None else None
    else:
        # pad two sides
        w_new = np.lcm.reduce([h * 2, 16])  # least common multiple
        h_4x3 = int(w_new * aspect_ratio)
        pad_top = h_4x3 - h
        pad_left = (w_new - w) // 2
        pad_right = w_new - w - pad_left
        rgb_pad = np.pad(rgb, ((pad_top, 0), (pad_left, pad_right), (0, 0)))
        obj_mask = np.pad(obj_mask, ((pad_top, 0), (pad_left, pad_right))) if obj_mask is not None else None
        person_mask = np.pad(person_mask, ((pad_top, 0), (pad_left, pad_right))) if person_mask is not None else None

    return rgb_pad, obj_mask, person_mask


def recrop_input(rgb, person_mask, obj_mask, dataset_name='behave'):
    "recrop input images"
    exp_ratio = 1.42
    if dataset_name == 'behave':
        mean_center = np.array([1008, 995])  # mean RGB image crop center
        behave_size = (2048, 1536)
        new_size = (int(750 * exp_ratio), int(exp_ratio * 750))
    else:
        mean_center = np.array([904, 668])  # mean RGB image crop center for bottle sequences of ICAP
        behave_size = (1920, 1080)
        new_size = (int(593.925 * exp_ratio), int(exp_ratio * 593.925))  # mean width of bottle sequences
    aspect_ratio = behave_size[1] / behave_size[0]
    pad_top = mean_center[1] - new_size[0] // 2
    pad_bottom = behave_size[1] - (mean_center[1] + new_size[0] // 2)
    pad_left = mean_center[0] - new_size[0] // 2
    pad_right = behave_size[0] - (mean_center[0] + new_size[0] // 2)

    # First resize to the same aspect ratio
    if rgb.shape[0] / rgb.shape[1] != aspect_ratio:
        rgb, obj_mask, person_mask = padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio)

    # Resize to the same size as behave image, to have a comparable pixel size
    rgb = cv2.resize(rgb, behave_size)
    mask_ps = cv2.resize(person_mask, behave_size)
    mask_obj = cv2.resize(obj_mask, behave_size)

    # Crop and resize the human + object patch
    bmin, bmax = masks2bbox([mask_ps, mask_obj])
    center = (bmin + bmax) // 2
    crop_size = int(np.max(bmax - bmin) * exp_ratio)  # larger crop to have background
    img_crop = cv2.resize(crop(rgb, center, crop_size), new_size)
    mask_ps = cv2.resize(crop(mask_ps, center, crop_size), new_size)
    mask_obj = cv2.resize(crop(mask_obj, center, crop_size), new_size)

    # Pad back to have the same shape as the behave image
    img_full = np.pad(img_crop, [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]])
    mask_ps_full = np.pad(mask_ps, [[pad_top, pad_bottom], [pad_left, pad_right]])
    mask_obj_full = np.pad(mask_obj, [[pad_top, pad_bottom], [pad_left, pad_right]])

    # Make sure the image shape is the same
    if img_full.shape[:2] != behave_size[::-1]:
        img_full = cv2.resize(img_full, behave_size)
        mask_ps_full = cv2.resize(mask_ps_full, behave_size)
        mask_obj_full = cv2.resize(mask_obj_full, behave_size)

    return img_full, mask_ps_full, mask_obj_full


class DemoDataset(BaseDataset):
    def __init__(self, data_paths, input_size=(224, 224),
                 std_coverage=3.5,  # used to estimate camera translation
                 ):
        super().__init__(data_paths, input_size)
        self.std_coverage = std_coverage

    def __len__(self):
        return len(self.data_paths)

    def __getitem__(self, idx):
        rgb_file = self.data_paths[idx]
        mask_hum, mask_obj = self.load_masks(rgb_file)
        rgb_full = cv2.imread(rgb_file)[:, :, ::-1]  # BGR -> RGB

        return self.image2dict(mask_hum, mask_obj, rgb_full, rgb_file)

    def image2dict(self, mask_hum, mask_obj, rgb_full, rgb_file=None):
        "do all the necessary preprocessing for images"
        if rgb_full.shape[:2] != mask_obj.shape[:2]:
            raise ValueError(f"The given object mask shape {mask_obj.shape[:2]} does not match "
                             f"the RGB image shape {rgb_full.shape[:2]}")
        if rgb_full.shape[:2] != mask_hum.shape[:2]:
            raise ValueError(f"The given human mask shape {mask_hum.shape[:2]} does not match "
                             f"the RGB image shape {rgb_full.shape[:2]}")
        if rgb_full.shape[:2] not in [(1080, 1920), (1536, 2048)]:
            # crop and resize the image to behave image size
            print(f"Recropping the input image and masks for {rgb_file}")
            rgb_full, mask_hum, mask_obj = recrop_input(rgb_full, mask_hum, mask_obj)
        color_h, color_w = rgb_full.shape[:2]

        # Input to the first stage model: human + object crop
        Kroi, objmask_fullcrop, psmask_fullcrop, rgb_fullcrop = self.crop_full_image(
            mask_hum.copy(), mask_obj.copy(), rgb_full.copy(), [mask_hum, mask_obj], 1.00)

        # Input to the second stage model: human and object crops
        Kroi_h, masko_hum, maskh_hum, rgb_hum = self.crop_full_image(
            mask_hum.copy(), mask_obj.copy(), rgb_full.copy(), [mask_hum, mask_hum], 1.05)
        Kroi_o, masko_obj, maskh_obj, rgb_obj = self.crop_full_image(
            mask_hum.copy(), mask_obj.copy(), rgb_full.copy(), [mask_obj, mask_obj], 1.5)

        # Estimate camera translation
        cent_transform = np.eye(4)  # the transform applied to the mesh that moves it back to kinect camera frame
        bmin_ho, bmax_ho = masks2bbox([mask_hum, mask_obj])
        crop_size_ho = int(np.max(bmax_ho - bmin_ho) * 1.0)
        if crop_size_ho % 2 == 1:
            crop_size_ho += 1  # make sure it is an even number
        is_behave = self.is_behave_dataset(rgb_full.shape[1])
        if rgb_full.shape[1] not in [2048, 1920]:
            raise ValueError('the image is not normalized to BEHAVE or ICAP size!')
        indices = np.indices(rgb_full.shape[:2])
        if np.sum(mask_obj > 127) < 5:
            raise ValueError(f'not enough object mask found for {rgb_file}')
        pts_h = np.stack([indices[1][mask_hum > 127], indices[0][mask_hum > 127]], -1)
        pts_o = np.stack([indices[1][mask_obj > 127], indices[0][mask_obj > 127]], -1)
        proj_cent_est = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2.  # heuristic to obtain 2d projection center
        transl_estimate = compute_translation(proj_cent_est, crop_size_ho, is_behave, self.std_coverage)
        cent_transform[:3, 3] = transl_estimate / 7.0
        radius = 0.5  # don't do normalization anymore
        cent = transl_estimate / 7.0
        comb = np.matmul(self.opencv2py3d, cent_transform)
        R = torch.from_numpy(comb[:3, :3]).float()
        T = torch.from_numpy(comb[:3, 3]).float() / (radius * 2)

        data_dict = {
            "R": R,
            "T": T,
            "K": torch.from_numpy(Kroi).float(),
            "T_ho": torch.from_numpy(cent).float(),  # translation for H+O
            "image_path": rgb_file,
            "image_size_hw": torch.tensor(self.input_size),
            "images": torch.from_numpy(rgb_fullcrop).float().permute(2, 0, 1),
            "masks": torch.from_numpy(np.stack([psmask_fullcrop, objmask_fullcrop], 0)).float(),
            'orig_image_size': torch.tensor([color_h, color_w]),

            # Human input to stage 2
            "images_hum": torch.from_numpy(rgb_hum).float().permute(2, 0, 1),
            "masks_hum": torch.from_numpy(np.stack([maskh_hum, masko_hum], 0)).float(),
            "K_hum": torch.from_numpy(Kroi_h).float(),

            # Object input to stage 2
            "images_obj": torch.from_numpy(rgb_obj).float().permute(2, 0, 1),
            "masks_obj": torch.from_numpy(np.stack([maskh_obj, masko_obj], 0)).float(),
            "K_obj": torch.from_numpy(Kroi_o).float(),

            # some normalization parameters
            "gt_trans": cent,
            'radius': radius,
            "estimated_trans": transl_estimate,
        }

        return data_dict

    def image2batch(self, rgb, mask_hum, mask_obj):
        """
        given input image, convert it into a batch object ready for model inference
        :param rgb: (h, w, 3), np array
        :param mask_hum: (h, w, 3), np array
        :param mask_obj: (h, w, 3), np array
        :return:
        """
        mask_hum = np.mean(mask_hum, -1)
        mask_obj = np.mean(mask_obj, -1)

        data_dict = self.image2dict(mask_hum, mask_obj, rgb, 'input image')
        # wrap each value into a length-1 list, i.e. a batch of size 1
        new_dict = {k: [v] for k, v in data_dict.items()}

        return new_dict
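

# A minimal usage sketch, not part of the original pipeline. It assumes that
# `data_paths` is a list of RGB file paths whose person/object masks can be
# resolved by `BaseDataset.load_masks`, that the module is executed as part of
# the package (the relative imports above require it), and that the downstream
# model consumes the dict produced by `image2dict`. The file path below is a
# hypothetical placeholder.
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    dataset = DemoDataset(['examples/demo_rgb.color.jpg'])  # hypothetical input image
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    for batch in loader:
        # each sample dict is collated into a batch of size 1 by the default collate_fn
        print(batch['images'].shape, batch['estimated_trans'])
        break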