# HDM-interaction-recon/dataset/demo_dataset.py
import os
import numpy as np
import cv2
import torch
from .base_data import BaseDataset
from .behave_paths import DataPaths
from .img_utils import compute_translation, masks2bbox, crop
def padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio=0.75):
"""
pad images to have 4:3 aspect ratio
:param rgb: (H, W, 3)
:param person_mask:
:param obj_mask:
:return: all images at the given aspect ratio
"""
h, w = rgb.shape[:2]
    if w > h / aspect_ratio:
        # image is too wide: pad the top to reach the target height
h_4x3 = int(w * aspect_ratio)
pad_top = h_4x3 - h
rgb_pad = np.pad(rgb, ((pad_top, 0), (0, 0), (0, 0)))
person_mask = np.pad(person_mask, ((pad_top, 0), (0, 0))) if person_mask is not None else None
obj_mask = np.pad(obj_mask, ((pad_top, 0), (0, 0))) if obj_mask is not None else None
else:
        # pad the top and both sides
        w_new = np.lcm.reduce([h * 2, 16])  # least common multiple, keeps the width divisible by 16
h_4x3 = int(w_new * aspect_ratio)
pad_top = h_4x3 - h
pad_left = (w_new - w) // 2
pad_right = w_new - w - pad_left
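        # e.g. for a 1000x1000 input with the default aspect_ratio=0.75:
        # w_new = lcm(2000, 16) = 2000, h_4x3 = 1500, so pad_top = 500 and
        # pad_left = pad_right = 500, giving a 1500x2000 output.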
rgb_pad = np.pad(rgb, ((pad_top, 0), (pad_left, pad_right), (0, 0)))
obj_mask = np.pad(obj_mask, ((pad_top, 0), (pad_left, pad_right))) if obj_mask is not None else None
person_mask = np.pad(person_mask, ((pad_top, 0), (pad_left, pad_right))) if person_mask is not None else None
return rgb_pad, obj_mask, person_mask
def recrop_input(rgb, person_mask, obj_mask, dataset_name='behave'):
"recrop input images"
exp_ratio = 1.42
if dataset_name == 'behave':
mean_center = np.array([1008, 995]) # mean RGB image crop center
behave_size = (2048, 1536)
new_size = (int(750 * exp_ratio), int(exp_ratio * 750))
else:
mean_center = np.array([904, 668]) # mean RGB image crop center for bottle sequences of ICAP
behave_size = (1920, 1080)
new_size = (int(593.925 * exp_ratio), int(exp_ratio * 593.925)) # mean width of bottle sequences
aspect_ratio = behave_size[1] / behave_size[0]
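    # Offsets that re-embed the (resized) human+object crop into a full-size canvas
    # so that its center lands at the dataset's mean crop center; new_size is square,
    # so new_size[0] is used for both axes.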
pad_top = mean_center[1] - new_size[0] // 2
pad_bottom = behave_size[1] - (mean_center[1] + new_size[0] // 2)
pad_left = mean_center[0] - new_size[0] // 2
pad_right = behave_size[0] - (mean_center[0] + new_size[0] // 2)
    # First pad to the BEHAVE/ICAP aspect ratio
if rgb.shape[0] / rgb.shape[1] != aspect_ratio:
rgb, obj_mask, person_mask = padTo_4x3(rgb, person_mask, obj_mask, aspect_ratio)
# Resize to the same size as behave image, to have a comparable pixel size
rgb = cv2.resize(rgb, behave_size)
mask_ps = cv2.resize(person_mask, behave_size)
mask_obj = cv2.resize(obj_mask, behave_size)
# Crop and resize the human + object patch
bmin, bmax = masks2bbox([mask_ps, mask_obj])
center = (bmin + bmax) // 2
crop_size = int(np.max(bmax - bmin) * exp_ratio) # larger crop to have background
img_crop = cv2.resize(crop(rgb, center, crop_size), new_size)
mask_ps = cv2.resize(crop(mask_ps, center, crop_size), new_size)
mask_obj = cv2.resize(crop(mask_obj, center, crop_size), new_size)
# Pad back to have same shape as behave image
img_full = np.pad(img_crop, [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]])
mask_ps_full = np.pad(mask_ps, [[pad_top, pad_bottom], [pad_left, pad_right]])
mask_obj_full = np.pad(mask_obj, [[pad_top, pad_bottom], [pad_left, pad_right]])
# Make sure the image shape is the same
if img_full.shape[:2] != behave_size[::-1]:
img_full = cv2.resize(img_full, behave_size)
mask_ps_full = cv2.resize(mask_ps_full, behave_size)
mask_obj_full = cv2.resize(mask_obj_full, behave_size)
return img_full, mask_ps_full, mask_obj_full
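
# Usage sketch (hypothetical file names): bring an arbitrary-size image to BEHAVE
# resolution before building the demo batch:
#   rgb = cv2.imread('frame.png')[:, :, ::-1]
#   mask_p = cv2.imread('person_mask.png', cv2.IMREAD_GRAYSCALE)
#   mask_o = cv2.imread('object_mask.png', cv2.IMREAD_GRAYSCALE)
#   rgb, mask_p, mask_o = recrop_input(rgb, mask_p, mask_o)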
class DemoDataset(BaseDataset):
def __init__(self, data_paths, input_size=(224, 224),
std_coverage=3.5, # used to estimate camera translation
):
super().__init__(data_paths, input_size)
self.std_coverage = std_coverage
def __len__(self):
return len(self.data_paths)
def __getitem__(self, idx):
rgb_file = self.data_paths[idx]
mask_hum, mask_obj = self.load_masks(rgb_file)
        rgb_full = cv2.imread(rgb_file)[:, :, ::-1]  # BGR -> RGB
return self.image2dict(mask_hum, mask_obj, rgb_full, rgb_file)
def image2dict(self, mask_hum, mask_obj, rgb_full, rgb_file=None):
"do all the necessary preprocessing for images"
if rgb_full.shape[:2] != mask_obj.shape[:2]:
raise ValueError(f"The given object mask shape {mask_obj.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
if rgb_full.shape[:2] != mask_hum.shape[:2]:
raise ValueError(f"The given human mask shape {mask_hum.shape[:2]} does not match the RGB image shape {rgb_full.shape[:2]}")
if rgb_full.shape[:2] not in [(1080, 1920), (1536, 2048)]:
# crop and resize the image to behave image size
print(f"Recropping the input image and masks for {rgb_file}")
rgb_full, mask_hum, mask_obj = recrop_input(rgb_full, mask_hum, mask_obj)
color_h, color_w = rgb_full.shape[:2]
# Input to the first stage model: human + object crop
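        # crop_full_image (defined in BaseDataset) is assumed to crop around the union
        # of the masks in the list argument, expanded by the given ratio, and to return
        # the crop intrinsics plus the cropped object mask, person mask and RGB patch.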
Kroi, objmask_fullcrop, psmask_fullcrop, rgb_fullcrop = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_hum, mask_obj],
1.00)
# Input to the second stage model: human and object crops
Kroi_h, masko_hum, maskh_hum, rgb_hum = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_hum, mask_hum], 1.05)
Kroi_o, masko_obj, maskh_obj, rgb_obj = self.crop_full_image(mask_hum.copy(),
mask_obj.copy(),
rgb_full.copy(),
[mask_obj, mask_obj], 1.5)
# Estimate camera translation
        cent_transform = np.eye(4)  # transform applied to the mesh to move it back to the Kinect camera frame
bmin_ho, bmax_ho = masks2bbox([mask_hum, mask_obj])
crop_size_ho = int(np.max(bmax_ho - bmin_ho) * 1.0)
if crop_size_ho % 2 == 1:
crop_size_ho += 1 # make sure it is an even number
is_behave = self.is_behave_dataset(rgb_full.shape[1])
if rgb_full.shape[1] not in [2048, 1920]:
raise ValueError('the image is not normalized to BEHAVE or ICAP size!')
indices = np.indices(rgb_full.shape[:2])
if np.sum(mask_obj > 127) < 5:
raise ValueError(f'not enough object mask found for {rgb_file}')
pts_h = np.stack([indices[1][mask_hum > 127], indices[0][mask_hum > 127]], -1)
pts_o = np.stack([indices[1][mask_obj > 127], indices[0][mask_obj > 127]], -1)
proj_cent_est = (np.mean(pts_h, 0) + np.mean(pts_o, 0)) / 2. # heuristic to obtain 2d projection center
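        # compute_translation (from img_utils) is assumed to turn the 2D projection
        # center and crop size into a rough 3D translation of the human+object mesh.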
transl_estimate = compute_translation(proj_cent_est, crop_size_ho, is_behave, self.std_coverage)
cent_transform[:3, 3] = transl_estimate / 7.0
radius = 0.5 # don't do normalization anymore
cent = transl_estimate / 7.0
comb = np.matmul(self.opencv2py3d, cent_transform)
R = torch.from_numpy(comb[:3, :3]).float()
T = torch.from_numpy(comb[:3, 3]).float() / (radius * 2)
data_dict = {
"R": R,
"T": T,
"K": torch.from_numpy(Kroi).float(),
"T_ho": torch.from_numpy(cent).float(), # translation for H+O
"image_path": rgb_file,
"image_size_hw": torch.tensor(self.input_size),
"images": torch.from_numpy(rgb_fullcrop).float().permute(2, 0, 1),
"masks": torch.from_numpy(np.stack([psmask_fullcrop, objmask_fullcrop], 0)).float(),
'orig_image_size': torch.tensor([color_h, color_w]),
# Human input to stage 2
"images_hum": torch.from_numpy(rgb_hum).float().permute(2, 0, 1),
"masks_hum": torch.from_numpy(np.stack([maskh_hum, masko_hum], 0)).float(),
"K_hum": torch.from_numpy(Kroi_h).float(),
# Object input to stage 2
"images_obj": torch.from_numpy(rgb_obj).float().permute(2, 0, 1),
"masks_obj": torch.from_numpy(np.stack([maskh_obj, masko_obj], 0)).float(),
"K_obj": torch.from_numpy(Kroi_o).float(),
# some normalization parameters
"gt_trans": cent,
'radius': radius,
"estimated_trans": transl_estimate,
}
return data_dict
def image2batch(self, rgb, mask_hum, mask_obj):
"""
given input image, convert it into a batch object ready for model inference
:param rgb: (h, w, 3), np array
:param mask_hum: (h, w, 3), np array
:param mask_obj: (h, w, 3), np array
        :return: a dict of single-element lists, ready to be collated for model inference
"""
        mask_hum = np.mean(mask_hum, -1)  # 3-channel mask -> single channel
        mask_obj = np.mean(mask_obj, -1)
        data_dict = self.image2dict(mask_hum, mask_obj, rgb, 'input image')
        # wrap each value in a length-1 list to form a batch of size one
        new_dict = {k: [v] for k, v in data_dict.items()}
return new_dict
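

if __name__ == '__main__':
    # Minimal usage sketch, not the official demo entry point: it assumes the
    # command-line arguments are RGB image paths whose person/object masks can
    # be located by BaseDataset.load_masks.
    import sys
    dataset = DemoDataset(sys.argv[1:])
    sample = dataset[0]
    for key, value in sample.items():
        print(key, getattr(value, 'shape', value))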