# -*- coding: utf-8 -*-
import os
import cv2
import torch
import numpy as np
from PIL import Image
import os.path as osp
import copy
from pycocotools.coco import COCO
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import sys
import math
import scipy.io as sio
import random

sys.path.append('..')
from util import vis_tool
from scipy import ndimage
import torchvision.transforms as transforms

joint_select = np.array([0, 1, 3, 5, 6, 7, 9, 11, 12, 13, 15, 17, 18, 19, 21,
                         23, 24, 25, 27, 28, 32, 30, 31])
# calculate = [0, 2, 4, 6, 8, 10, 12, 14, 16, 17, 18, 21, 22, 20]
calculate = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
             19, 20, 21, 22]

DexYCB2MANO = [0, 5, 6, 7, 9, 10, 11, 17, 18, 19, 13, 14, 15, 1, 2, 3, 8, 12, 20, 16, 4]
HO3D2MANO = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20, 19, 16]

xrange = range  # Python 2 compatibility shim


def calculateCoM(dpt, minDepth, maxDepth):
    """
    Calculate the center of mass
    :param dpt: depth image
    :return: (x,y,z) center of mass
    """
    dc = dpt.copy()
    dc[dc < minDepth] = 0
    dc[dc > maxDepth] = 0
    cc = ndimage.center_of_mass(dc > 0)  # scipy.ndimage.measurements is deprecated
    num = np.count_nonzero(dc)
    com = np.array((cc[1] * num, cc[0] * num, dc.sum()), np.float64)
    if num == 0:
        return np.array((300, 300, 500), np.float64)
    else:
        return com / num


def pixel2world(x, y, z, paras):
    fx, fy, fu, fv = paras
    worldX = (x - fu) * z / fx
    worldY = (fv - y) * z / fy
    return worldX, worldY


def pixel2world_noflip(x, y, z, paras):
    fx, fy, fu, fv = paras
    worldX = (x - fu) * z / fx
    worldY = (y - fv) * z / fy
    return worldX, worldY


def world2pixel(x, y, z, paras):
    fx, fy, fu, fv = paras
    pixelX = x * fx / z + fu
    pixelY = fv - y * fy / z
    return pixelX, pixelY
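
# Illustrative sanity check (added; not part of the original pipeline): with
# the flipped y-axis convention above, world2pixel inverts pixel2world. The
# intrinsics below are the NYU Kinect parameters used by nyu_loader later in
# this file.
def _check_projection_roundtrip():
    paras = (588.03, 587.07, 320., 240.)
    u, v, z = 400.0, 200.0, 750.0
    x, y = pixel2world(u, v, z, paras)
    u2, v2 = world2pixel(x, y, z, paras)
    assert abs(u - u2) < 1e-6 and abs(v - v2) < 1e-6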
def rotatePoint2D(p1, center, angle):
    """
    Rotate a point in 2D around center
    :param p1: point in 2D (u,v,d)
    :param center: 2D center of rotation
    :param angle: angle in deg
    :return: rotated point
    """
    alpha = angle * np.pi / 180.
    pp = p1.copy()
    pp[0:2] -= center[0:2]
    pr = np.zeros_like(pp)
    pr[0] = pp[0] * np.cos(alpha) - pp[1] * np.sin(alpha)
    pr[1] = pp[0] * np.sin(alpha) + pp[1] * np.cos(alpha)
    pr[2] = pp[2]
    ps = pr
    ps[0:2] += center[0:2]
    return ps


def rotatePoints2D(pts, center, angle):
    """
    Rotate points in 2D around center
    :param pts: point coordinates
    :param center: 2D center of rotation
    :param angle: angle in deg
    :return: rotated points
    """
    ret = pts.copy()
    for i in range(pts.shape[0]):
        ret[i] = rotatePoint2D(pts[i], center, angle)
    return ret


def transformPoint2D(pt, M):
    """
    Transform point in 2D coordinates
    :param pt: point coordinates
    :param M: transformation matrix
    :return: transformed point
    """
    pt2 = np.dot(np.asarray(M).reshape((3, 3)), np.asarray([pt[0], pt[1], 1]))
    return np.asarray([pt2[0] / pt2[2], pt2[1] / pt2[2]])


def transformPoints2D(pts, M):
    """
    Transform points in 2D coordinates
    :param pts: point coordinates
    :param M: transformation matrix
    :return: transformed points
    """
    ret = pts.copy()
    for i in range(pts.shape[0]):
        ret[i, 0:2] = transformPoint2D(pts[i, 0:2], M)
    return ret


def nyu_reader(img_path):
    img = cv2.imread(img_path)
    # low byte in channel 0, high byte in channel 1; cast before shifting to
    # avoid uint8 overflow
    depth = np.asarray(img[:, :, 0].astype(np.int32) + img[:, :, 1].astype(np.int32) * 256,
                       dtype=np.float32)
    return depth


def icvl_reader(img_path):
    img = Image.open(img_path)  # open image
    assert len(img.getbands()) == 1  # ensure depth image
    depth = np.asarray(img, np.float32)
    return depth


def msra_reader(image_name, para):
    f = open(image_name, 'rb')
    data = np.fromfile(f, dtype=np.uint32)
    width, height, left, top, right, bottom = data[:6]
    depth = np.zeros((height, width), dtype=np.float32)
    f.seek(4 * 6)  # skip the six-uint32 header and re-read the payload as float32
    data = np.fromfile(f, dtype=np.float32)
    depth[top:bottom, left:right] = np.reshape(data, (bottom - top, right - left))
    depth_pcl = np.reshape(data, (bottom - top, right - left))
    # convert to world
    imgHeight, imgWidth = depth_pcl.shape
    hand_3d = np.zeros([3, imgHeight * imgWidth])
    d2Output_x = np.tile(np.arange(imgWidth), (imgHeight, 1)).reshape(imgHeight, imgWidth).astype('float64') + left
    d2Output_y = np.repeat(np.arange(imgHeight), imgWidth).reshape(imgHeight, imgWidth).astype('float64') + top
    hand_3d[0], hand_3d[1] = pixel2world(d2Output_x.reshape(-1), d2Output_y.reshape(-1),
                                         depth_pcl.reshape(-1), para)
    hand_3d[2] = depth_pcl.reshape(-1)
    valid = np.arange(0, imgWidth * imgHeight)
    valid = valid[(hand_3d[0, :] != 0) | (hand_3d[1, :] != 0) | (hand_3d[2, :] != 0)]
    handpoints = hand_3d[:, valid].transpose(1, 0)
    return depth, handpoints


def hands17_reader(img_path):
    img = Image.open(img_path)  # open image
    assert len(img.getbands()) == 1  # ensure depth image
    depth = np.asarray(img, np.float32)
    return depth
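
# Illustrative example (added; synthetic data): transformPoints2D applies a
# 3x3 homography to the (u, v) columns of an (N, 3) array and leaves the
# depth column untouched. This is how the crop matrices M produced by
# loader.comToTransform below are applied to joint annotations.
def _check_transform_points2d():
    pts = np.array([[10., 20., 500.], [30., 40., 510.]])
    M = np.array([[2., 0., 5.],
                  [0., 2., -5.],
                  [0., 0., 1.]])
    out = transformPoints2D(pts, M)
    assert np.allclose(out[0, :2], [25., 35.])  # 2*10+5, 2*20-5
    assert np.allclose(out[:, 2], pts[:, 2])    # depth is preserved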
from sklearn.decomposition import PCA


class loader(Dataset):
    def __init__(self, root_dir, phase, img_size, center_type, dataset_name):
        self.rng = np.random.RandomState(23455)
        self.dataset_name = dataset_name
        self.root_dir = root_dir
        self.phase = phase
        self.img_size = img_size
        self.center_type = center_type
        self.allJoints = False
        self.pca = PCA(n_components=3)
        self.sample_num = 1024

    # numpy
    def jointImgTo3D(self, uvd, paras=None, flip=None):
        if isinstance(paras, tuple):
            fx, fy, fu, fv = paras
        else:
            fx, fy, fu, fv = self.paras
        if flip is None:
            flip = self.flip
        ret = np.zeros_like(uvd, np.float32)
        if len(ret.shape) == 1:
            ret[0] = (uvd[0] - fu) * uvd[2] / fx
            ret[1] = flip * (uvd[1] - fv) * uvd[2] / fy
            ret[2] = uvd[2]
        elif len(ret.shape) == 2:
            ret[:, 0] = (uvd[:, 0] - fu) * uvd[:, 2] / fx
            ret[:, 1] = flip * (uvd[:, 1] - fv) * uvd[:, 2] / fy
            ret[:, 2] = uvd[:, 2]
        else:
            ret[:, :, 0] = (uvd[:, :, 0] - fu) * uvd[:, :, 2] / fx
            ret[:, :, 1] = flip * (uvd[:, :, 1] - fv) * uvd[:, :, 2] / fy
            ret[:, :, 2] = uvd[:, :, 2]
        return ret

    def joint3DToImg(self, xyz, paras=None, flip=1):
        if isinstance(paras, tuple):
            fx, fy, fu, fv = paras
        else:
            fx, fy, fu, fv = self.paras
        if flip is None:
            flip = self.flip
        ret = np.zeros_like(xyz, np.float32)
        if len(ret.shape) == 1:
            ret[0] = (xyz[0] * fx / xyz[2] + fu)
            ret[1] = (flip * xyz[1] * fy / xyz[2] + fv)
            ret[2] = xyz[2]
        elif len(ret.shape) == 2:
            ret[:, 0] = (xyz[:, 0] * fx / xyz[:, 2] + fu)
            ret[:, 1] = (flip * xyz[:, 1] * fy / xyz[:, 2] + fv)
            ret[:, 2] = xyz[:, 2]
        else:
            ret[:, :, 0] = (xyz[:, :, 0] * fx / xyz[:, :, 2] + fu)
            ret[:, :, 1] = (flip * xyz[:, :, 1] * fy / xyz[:, :, 2] + fv)
            ret[:, :, 2] = xyz[:, :, 2]
        return ret

    # tensor
    def pointsImgTo3D(self, point_uvd, paras, flip=None):
        if flip is None:
            flip = self.flip
        point_xyz = torch.zeros_like(point_uvd).to(point_uvd.device)
        point_xyz[:, :, 0] = (point_uvd[:, :, 0] - paras[:, 2].unsqueeze(1)) * point_uvd[:, :, 2] / paras[:, 0].unsqueeze(1)
        point_xyz[:, :, 1] = flip * (point_uvd[:, :, 1] - paras[:, 3].unsqueeze(1)) * point_uvd[:, :, 2] / paras[:, 1].unsqueeze(1)
        point_xyz[:, :, 2] = point_uvd[:, :, 2]
        return point_xyz

    def points3DToImg(self, joint_xyz, para, flip=None):
        if flip is None:
            flip = self.flip
        joint_uvd = torch.zeros_like(joint_xyz).to(joint_xyz.device)
        joint_uvd[:, :, 0] = (joint_xyz[:, :, 0] * para[:, 0].unsqueeze(1) / (joint_xyz[:, :, 2] + 1e-8)
                              + para[:, 2].unsqueeze(1))
        joint_uvd[:, :, 1] = (flip * joint_xyz[:, :, 1] * para[:, 1].unsqueeze(1) / (joint_xyz[:, :, 2] + 1e-8)
                              + para[:, 3].unsqueeze(1))
        joint_uvd[:, :, 2] = joint_xyz[:, :, 2]
        return joint_uvd

    # augment
    def comToBounds(self, com, size, paras):
        fx, fy, fu, fv = paras
        zstart = com[2] - size[2] / 2.
        zend = com[2] + size[2] / 2.
        xstart = int(np.floor((com[0] * com[2] / fx - size[0] / 2.) / com[2] * fx + 0.5))
        xend = int(np.floor((com[0] * com[2] / fx + size[0] / 2.) / com[2] * fx + 0.5))
        ystart = int(np.floor((com[1] * com[2] / fy - size[1] / 2.) / com[2] * fy + 0.5))
        yend = int(np.floor((com[1] * com[2] / fy + size[1] / 2.) / com[2] * fy + 0.5))
        return xstart, xend, ystart, yend, zstart, zend

    def comToTransform(self, com, size, dsize, paras):
        """
        Calculate affine transform from crop
        :param com: center of mass, in image coordinates (x,y,z), z in mm
        :param size: (x,y,z) extent of the source crop volume in mm
        :return: affine transform
        """
        xstart, xend, ystart, yend, _, _ = self.comToBounds(com, size, paras)
        trans = np.eye(3)
        trans[0, 2] = -xstart
        trans[1, 2] = -ystart
        wb = (xend - xstart)
        hb = (yend - ystart)
        if wb > hb:
            scale = np.eye(3) * dsize[0] / float(wb)
            sz = (dsize[0], hb * dsize[0] / wb)
        else:
            scale = np.eye(3) * dsize[1] / float(hb)
            sz = (wb * dsize[1] / hb, dsize[1])
        scale[2, 2] = 1
        # ori
        # xstart = int(np.floor(dsize[0] / 2. - sz[1] / 2.))
        # ystart = int(np.floor(dsize[1] / 2. - sz[0] / 2.))
        # change by pengfeiren
        xstart = int(np.floor(dsize[0] / 2. - sz[0] / 2.))
        ystart = int(np.floor(dsize[1] / 2. - sz[1] / 2.))
        off = np.eye(3)
        off[0, 2] = xstart
        off[1, 2] = ystart
        return np.dot(off, np.dot(scale, trans))
    def recropHand(self, crop, M, Mnew, target_size, paras, background_value=0., nv_val=0., thresh_z=True,
                   com=None, size=(250, 250, 250)):
        flags = cv2.INTER_NEAREST
        if len(target_size) > 2:
            target_size = target_size[0:2]
        warped = cv2.warpPerspective(crop, np.dot(M, Mnew), target_size, flags=flags,
                                     borderMode=cv2.BORDER_CONSTANT, borderValue=float(background_value))
        # warped[np.isclose(warped, nv_val)] = background_value
        # Outliers will appear on the edge
        if thresh_z:
            warped[warped < nv_val] = background_value
        if thresh_z is True:
            assert com is not None
            _, _, _, _, zstart, zend = self.comToBounds(com, size, paras)
            msk1 = np.logical_and(warped < zstart, warped != 0)
            msk2 = np.logical_and(warped > zend, warped != 0)
            warped[msk1] = zstart
            warped[msk2] = 0.  # backface is at 0, it is set later
        return warped

    def moveCoM(self, dpt, cube, com, off, joints3D, M, paras=None, pad_value=0, thresh_z=True):
        """
        Adjust already cropped image such that a moving CoM normalization is simulated
        :param dpt: cropped depth image with different CoM
        :param cube: metric cube of size (sx,sy,sz)
        :param com: original center of mass, in image coordinates (x,y,z)
        :param off: offset to center of mass (dx,dy,dz) in 3D coordinates
        :param joints3D: 3D joint coordinates, cropped to old CoM
        :param pad_value: value of padding
        :return: adjusted image, new 3D joint coordinates, new center of mass in image coordinates
        """
        # if offset is 0, nothing to do
        if np.allclose(off, 0.):
            return dpt, joints3D, com, M
        # add offset to com
        new_com = self.joint3DToImg(self.jointImgTo3D(com, paras) + off, paras)
        # check for 1/0.
        if not (np.allclose(com[2], 0.) or np.allclose(new_com[2], 0.)):
            # scale to original size
            Mnew = self.comToTransform(new_com, cube, dpt.shape, paras)
            if len(dpt[dpt > 0]) > 0:
                new_dpt = self.recropHand(dpt, Mnew, np.linalg.inv(M), dpt.shape, paras,
                                          background_value=pad_value, nv_val=np.min(dpt[dpt > 0]) - 1,
                                          thresh_z=thresh_z, com=new_com, size=cube)
            else:
                new_dpt = self.recropHand(dpt, Mnew, np.linalg.inv(M), dpt.shape, paras,
                                          background_value=pad_value, nv_val=-1,
                                          thresh_z=thresh_z, com=new_com, size=cube)
        else:
            Mnew = M
            new_dpt = dpt
        # adjust joint positions to new CoM
        new_joints3D = joints3D + self.jointImgTo3D(com, paras) - self.jointImgTo3D(new_com, paras)
        return new_dpt, new_joints3D, new_com, Mnew

    def rotateHand(self, dpt, cube, com, rot, joints3D, paras=None, pad_value=0, thresh_z=True):
        """
        Rotate hand virtually in the image plane by a given angle
        :param dpt: cropped depth image with different CoM
        :param cube: metric cube of size (sx,sy,sz)
        :param com: original center of mass, in image coordinates (x,y,z)
        :param rot: rotation angle in deg
        :param joints3D: original joint coordinates, in 3D coordinates (x,y,z)
        :param pad_value: value of padding
        :return: adjusted image, new 3D joint coordinates, applied rotation angle in deg
        """
        # if rot is 0, nothing to do
        if np.allclose(rot, 0.):
            return dpt, joints3D, rot
        rot = np.mod(rot, 360)
        M = cv2.getRotationMatrix2D((dpt.shape[1] // 2, dpt.shape[0] // 2), -rot, 1)
        flags = cv2.INTER_NEAREST
        new_dpt = cv2.warpAffine(dpt, M, (dpt.shape[1], dpt.shape[0]), flags=flags,
                                 borderMode=cv2.BORDER_CONSTANT, borderValue=pad_value)
        if thresh_z and len(dpt[dpt > 0]) > 0:
            new_dpt[new_dpt < (np.min(dpt[dpt > 0]) - 1)] = 0
        com3D = self.jointImgTo3D(com, paras)
        joint_2D = self.joint3DToImg(joints3D + com3D, paras)
        data_2D = np.zeros_like(joint_2D)
        for k in xrange(data_2D.shape[0]):
            data_2D[k] = rotatePoint2D(joint_2D[k], com[0:2], rot)
        new_joints3D = (self.jointImgTo3D(data_2D, paras) - com3D)
        return new_dpt, new_joints3D, rot
    def scaleHand(self, dpt, cube, com, sc, joints3D, M, paras, pad_value=0, thresh_z=True):
        """
        Virtually scale the hand by applying a different cube
        :param dpt: cropped depth image with different CoM
        :param cube: metric cube of size (sx,sy,sz)
        :param com: original center of mass, in image coordinates (x,y,z)
        :param sc: scale factor for cube
        :param joints3D: 3D joint coordinates, cropped to old CoM
        :param pad_value: value of padding
        :return: adjusted image, new 3D joint coordinates, new cube
        """
        # if scale is 1, nothing to do
        if np.allclose(sc, 1.):
            return dpt, joints3D, cube, M
        new_cube = [s * sc for s in cube]
        # check for 1/0.
        if not np.allclose(com[2], 0.):
            # scale to original size
            Mnew = self.comToTransform(com, new_cube, dpt.shape, paras)
            if len(dpt[dpt > 0]) > 0:
                new_dpt = self.recropHand(dpt, Mnew, np.linalg.inv(M), dpt.shape, paras,
                                          background_value=pad_value, nv_val=np.min(dpt[dpt > 0]) - 1,
                                          thresh_z=thresh_z, com=com, size=cube)
            else:
                new_dpt = self.recropHand(dpt, Mnew, np.linalg.inv(M), dpt.shape, paras,
                                          background_value=pad_value, nv_val=-1,
                                          thresh_z=thresh_z, com=com, size=cube)
        else:
            Mnew = M
            new_dpt = dpt
        new_joints3D = joints3D
        return new_dpt, new_joints3D, new_cube, Mnew

    def rand_augment(self, sigma_com=None, sigma_sc=None, rot_range=None):
        if sigma_com is None:
            sigma_com = 35.
        if sigma_sc is None:
            sigma_sc = 0.05
        if rot_range is None:
            rot_range = 180.
        # mode = self.rng.randint(0, len(self.aug_modes))
        # off = self.rng.randn(3) * sigma_com  # +-px/mm
        # rot = self.rng.uniform(-rot_range, rot_range)
        # sc = abs(1. + self.rng.randn() * sigma_sc)
        mode = random.randint(0, len(self.aug_modes) - 1)
        off = np.array([random.uniform(-1, 1) for a in range(3)]) * sigma_com  # +-px/mm
        rot = random.uniform(-rot_range, rot_range)
        sc = abs(1. + random.uniform(-1, 1) * sigma_sc)
        return mode, off, rot, sc

    def augmentCrop_RGB(self, img, gt3Dcrop, com, cube, M, mode, off, rot, sc, paras=None, normZeroOne=False):
        """
        Commonly used function to augment hand poses
        :param img: cropped RGB image
        :param gt3Dcrop: 3D annotations, centered on the CoM
        :param com: center of mass in image coordinates (x,y,z)
        :param cube: metric cube
        :param M: crop transformation matrix
        :param mode: index into self.aug_modes
        :param off: CoM offset (used in 'com' mode)
        :param rot: rotation angle in deg (used in 'rot' mode)
        :param sc: scale factor (used in 'sc' mode)
        :return: image, 3D annotations (unnormalized), cube, com (image coordinates), M, rot
        """
        # assert len(img.shape) == 3
        assert isinstance(self.aug_modes, list)
        if self.aug_modes[mode] == 'com':
            rot = 0.
            sc = 1.
            imgRGB, new_joints3D, com, M = self.moveCoM(img.astype('float32'), cube, com, off, gt3Dcrop, M,
                                                        paras, pad_value=0, thresh_z=False)
        elif self.aug_modes[mode] == 'rot':
            off = np.zeros((3,))
            sc = 1.
            imgRGB, new_joints3D, rot = self.rotateHand(img.astype('float32'), cube, com, rot, gt3Dcrop,
                                                        paras, pad_value=0, thresh_z=False)
        elif self.aug_modes[mode] == 'sc':
            off = np.zeros((3,))
            rot = 0.
            imgRGB, new_joints3D, cube, M = self.scaleHand(img.astype('float32'), cube, com, sc, gt3Dcrop, M,
                                                           paras, pad_value=0, thresh_z=False)
        elif self.aug_modes[mode] == 'none':
            off = np.zeros((3,))
            sc = 1.
            rot = 0.
            imgRGB = img
            new_joints3D = gt3Dcrop
        else:
            raise NotImplementedError()
        # cv2.imwrite('./example_vis/0_rgb.png', imgRGB.astype(np.uint8))
        return imgRGB, None, new_joints3D, np.asarray(cube), com, M, rot
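
    # Descriptive note (added): augmentCrop below is the depth counterpart of
    # augmentCrop_RGB above. The two are called with the same (mode, off, rot,
    # sc) so the RGB and depth crops stay geometrically aligned; the RGB
    # variant passes thresh_z=False (colour values are not depths to clamp)
    # and skips the final normalize_img step.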
    def augmentCrop(self, img, gt3Dcrop, com, cube, M, mode, off, rot, sc, paras=None, normZeroOne=False):
        """
        Commonly used function to augment hand poses
        :param img: cropped depth image
        :param gt3Dcrop: 3D annotations, centered on the CoM
        :param com: center of mass in image coordinates (x,y,z)
        :param cube: metric cube
        :param M: crop transformation matrix
        :param mode: index into self.aug_modes
        :param off: CoM offset (used in 'com' mode)
        :param rot: rotation angle in deg (used in 'rot' mode)
        :param sc: scale factor (used in 'sc' mode)
        :return: normalized image, 3D annotations (unnormalized), cube, com (image coordinates), M, rot
        """
        assert len(img.shape) == 2
        assert isinstance(self.aug_modes, list)
        premax = img.max()
        if np.max(img) == 0:
            imgD = img
            new_joints3D = gt3Dcrop
        elif self.aug_modes[mode] == 'com':
            rot = 0.
            sc = 1.
            imgD, new_joints3D, com, M = self.moveCoM(img.astype('float32'), cube, com, off, gt3Dcrop, M,
                                                      paras, pad_value=0)
        elif self.aug_modes[mode] == 'rot':
            off = np.zeros((3,))
            sc = 1.
            imgD, new_joints3D, rot = self.rotateHand(img.astype('float32'), cube, com, rot, gt3Dcrop,
                                                      paras, pad_value=0)
        elif self.aug_modes[mode] == 'sc':
            off = np.zeros((3,))
            rot = 0.
            imgD, new_joints3D, cube, M = self.scaleHand(img.astype('float32'), cube, com, sc, gt3Dcrop, M,
                                                         paras, pad_value=0)
        elif self.aug_modes[mode] == 'none':
            off = np.zeros((3,))
            sc = 1.
            rot = 0.
            imgD = img
            new_joints3D = gt3Dcrop
        else:
            raise NotImplementedError()
        imgD = self.normalize_img(premax, imgD, com, cube)
        return imgD, None, new_joints3D, np.asarray(cube), com, M, rot

    def normalize_img(self, premax, imgD, com, cube):
        imgD[imgD == premax] = com[2] + (cube[2] / 2.)
        imgD[imgD == 0] = com[2] + (cube[2] / 2.)
        imgD[imgD >= com[2] + (cube[2] / 2.)] = com[2] + (cube[2] / 2.)
        imgD[imgD <= com[2] - (cube[2] / 2.)] = com[2] - (cube[2] / 2.)
        imgD -= com[2]
        imgD /= (cube[2] / 2.)
        return imgD
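
    # Descriptive note (added): normalize_img clamps depth to the metric cube
    # around the crop centre, [com_z - cube_z/2, com_z + cube_z/2], and maps
    # it linearly to [-1, 1]. Invalid pixels (zeros and the pre-crop maximum)
    # are pushed to the far plane, so background ends up at exactly +1, which
    # getpcl below relies on via np.isclose(imgD, 1).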
    def Crop_Image_deep_pp_RGB(self, depth, com, size, dsize, paras):
        """
        Crop the hand region from the 3D volume, scaled inversely to the distance of the hand to the camera
        :param com: center of mass, in image coordinates (x,y,z), z in mm
        :param size: (x,y,z) extent of the source crop volume in mm
        :param dsize: (x,y) extent of the destination size
        :return: cropped hand image, transformation matrix for joints, CoM in image coordinates
        """
        if len(size) != 3:
            raise ValueError("Size must be 3D and bounding box")
        # calculate boundaries
        xstart, xend, ystart, yend, zstart, zend = self.comToBounds(com, size, paras)
        # crop patch from source
        cropped = self.getCrop(depth, xstart, xend, ystart, yend, zstart, zend, thresh_z=False)
        # resize to the same size
        wb = (xend - xstart)
        hb = (yend - ystart)
        if wb > hb:
            sz = (dsize[0], int(hb * dsize[0] / wb))
        else:
            sz = (int(wb * dsize[1] / hb), dsize[1])
        trans = np.eye(3)
        trans[0, 2] = -xstart
        trans[1, 2] = -ystart
        if cropped.shape[0] > cropped.shape[1]:
            scale = np.eye(3) * sz[1] / float(cropped.shape[0])
        else:
            scale = np.eye(3) * sz[0] / float(cropped.shape[1])
        scale[2, 2] = 1
        # depth resize
        rz = cv2.resize(cropped, sz, interpolation=cv2.INTER_NEAREST)
        rgb_size = (dsize[0], dsize[1], 3)
        ret = np.ones(rgb_size, np.float32) * 0  # use background as filler
        xstart = int(np.floor(dsize[0] / 2. - rz.shape[1] / 2.))
        xend = int(xstart + rz.shape[1])
        ystart = int(np.floor(dsize[1] / 2. - rz.shape[0] / 2.))
        yend = int(ystart + rz.shape[0])
        ret[ystart:yend, xstart:xend] = rz
        off = np.eye(3)
        off[0, 2] = xstart
        off[1, 2] = ystart
        return ret, np.dot(off, np.dot(scale, trans))

    # use deep-pp's method
    def Crop_Image_deep_pp(self, depth, com, size, dsize, paras):
        """
        Crop the hand region from the 3D volume, scaled inversely to the distance of the hand to the camera
        :param com: center of mass, in image coordinates (x,y,z), z in mm
        :param size: (x,y,z) extent of the source crop volume in mm
        :param dsize: (x,y) extent of the destination size
        :return: cropped hand image, transformation matrix for joints, CoM in image coordinates
        """
        if len(size) != 3 or len(dsize) != 2:
            raise ValueError("Size must be 3D and dsize 2D bounding box")
        # calculate boundaries
        xstart, xend, ystart, yend, zstart, zend = self.comToBounds(com, size, paras)
        # crop patch from source
        cropped = self.getCrop(depth, xstart, xend, ystart, yend, zstart, zend)
        # resize to the same size
        wb = (xend - xstart)
        hb = (yend - ystart)
        if wb > hb:
            sz = (dsize[0], int(hb * dsize[0] / wb))
        else:
            sz = (int(wb * dsize[1] / hb), dsize[1])
        trans = np.eye(3)
        trans[0, 2] = -xstart
        trans[1, 2] = -ystart
        if cropped.shape[0] > cropped.shape[1]:
            scale = np.eye(3) * sz[1] / float(cropped.shape[0])
        else:
            scale = np.eye(3) * sz[0] / float(cropped.shape[1])
        scale[2, 2] = 1
        # depth resize
        rz = cv2.resize(cropped, sz, interpolation=cv2.INTER_NEAREST)
        ret = np.ones(dsize, np.float32) * 0  # use background as filler
        xstart = int(np.floor(dsize[0] / 2. - rz.shape[1] / 2.))
        xend = int(xstart + rz.shape[1])
        ystart = int(np.floor(dsize[1] / 2. - rz.shape[0] / 2.))
        yend = int(ystart + rz.shape[0])
        ret[ystart:yend, xstart:xend] = rz
        off = np.eye(3)
        off[0, 2] = xstart
        off[1, 2] = ystart
        return ret, np.dot(off, np.dot(scale, trans))

    def getCrop(self, depth, xstart, xend, ystart, yend, zstart, zend, thresh_z=True, background=0):
        """
        Crop patch from image
        :param depth: depth image to crop from
        :param xstart: start x
        :param xend: end x
        :param ystart: start y
        :param yend: end y
        :param zstart: start z
        :param zend: end z
        :param thresh_z: threshold z values
        :return: cropped image
        """
        if len(depth.shape) == 2:
            cropped = depth[max(ystart, 0):min(yend, depth.shape[0]),
                            max(xstart, 0):min(xend, depth.shape[1])].copy()
            # add pixels that are out of the image in order to keep aspect ratio
            cropped = np.pad(cropped,
                             ((abs(ystart) - max(ystart, 0), abs(yend) - min(yend, depth.shape[0])),
                              (abs(xstart) - max(xstart, 0), abs(xend) - min(xend, depth.shape[1]))),
                             mode='constant', constant_values=background)
        elif len(depth.shape) == 3:
            cropped = depth[max(ystart, 0):min(yend, depth.shape[0]),
                            max(xstart, 0):min(xend, depth.shape[1]), :].copy()
            # add pixels that are out of the image in order to keep aspect ratio
            cropped = np.pad(cropped,
                             ((abs(ystart) - max(ystart, 0), abs(yend) - min(yend, depth.shape[0])),
                              (abs(xstart) - max(xstart, 0), abs(xend) - min(xend, depth.shape[1])),
                              (0, 0)),
                             mode='constant', constant_values=background)
        else:
            raise NotImplementedError()
        if thresh_z is True:
            msk1 = np.logical_and(cropped < zstart, cropped != 0)
            msk2 = np.logical_and(cropped > zend, cropped != 0)
            cropped[msk1] = zstart
            cropped[msk2] = 0.  # backface is at 0, it is set later
        return cropped
    # tensor
    def unnormal_joint_img(self, joint_img):
        device = joint_img.device
        joint = torch.zeros(joint_img.size()).to(device)
        joint[:, :, 0:2] = (joint_img[:, :, 0:2] + 1) / 2 * self.img_size
        joint[:, :, 2] = (joint_img[:, :, 2] + 1) / 2 * self.cube_size[2]
        return joint

    def uvd_nl2xyz_tensor(self, uvd, center, m, cube, cam_paras):
        batch_size, point_num, _ = uvd.size()
        device = uvd.device
        cube_size_t = cube.to(device).view(batch_size, 1, 3).repeat(1, point_num, 1)
        center_t = center.to(device).view(batch_size, 1, 3).repeat(1, point_num, 1)
        M_t = m.to(device).view(batch_size, 1, 3, 3)
        M_inverse = torch.linalg.inv(M_t).repeat(1, point_num, 1, 1)
        uv_unnormal = (uvd[:, :, 0:2] + 1) * (self.img_size / 2)
        d_unnormal = (uvd[:, :, 2:]) * (cube_size_t[:, :, 2:] / 2.0) + center_t[:, :, 2:]
        uvd_unnormal = torch.cat((uv_unnormal, d_unnormal), dim=-1)
        uvd_world = self.get_trans_points(uvd_unnormal, M_inverse)
        xyz = self.pointsImgTo3D(uvd_world, cam_paras)
        return xyz

    def uvd_nl2xyznl_tensor(self, uvd, center, m, cube, cam_paras):
        batch_size, point_num, _ = uvd.size()
        device = uvd.device
        cube_size_t = cube.to(device).view(batch_size, 1, 3).repeat(1, point_num, 1)
        center_t = center.to(device).view(batch_size, 1, 3).repeat(1, point_num, 1)
        M_t = m.to(device).view(batch_size, 1, 3, 3)
        M_inverse = torch.linalg.inv(M_t).repeat(1, point_num, 1, 1)
        uv_unnormal = (uvd[:, :, 0:2] + 1) * (self.img_size / 2)
        d_unnormal = (uvd[:, :, 2:]) * (cube_size_t[:, :, 2:] / 2.0) + center_t[:, :, 2:]
        uvd_unnormal = torch.cat((uv_unnormal, d_unnormal), dim=-1)
        uvd_world = self.get_trans_points(uvd_unnormal, M_inverse)
        xyz = self.pointsImgTo3D(uvd_world, cam_paras)
        xyz_normal = (xyz - center_t) / (cube_size_t / 2.0)
        return xyz_normal

    def img2anchor_dis(self, joint_uvd, img, center, M, cube, cam_para, gamma=10):
        '''
        :param joint_uvd: BxJx3 Tensor in normalized uvd coordinates
        :param img: Bx1xWxH Tensor
        :return: closeness map BxJxHxW
        '''
        device = img.device
        B, J, _ = joint_uvd.size()
        B, _, W, H = img.size()
        joint_xyz = self.uvd_nl2xyznl_tensor(joint_uvd, center, M, cube, cam_para)
        mesh_x = 2.0 * (torch.arange(W).unsqueeze(1).expand(W, W).float() + 0.5) / W - 1.0
        mesh_y = 2.0 * (torch.arange(W).unsqueeze(0).expand(W, W).float() + 0.5) / W - 1.0
        coords = torch.stack((mesh_y, mesh_x), dim=0)
        coords = torch.unsqueeze(coords, dim=0).repeat(B, 1, 1, 1).to(device)
        img_uvd = torch.cat((coords, img), dim=1).view(B, 3, H * W).permute(0, 2, 1)
        # img_uvd[img_uvd[:, :, 2] == 1] = 10
        # B*J*N*3
        img_xyz = self.uvd_nl2xyznl_tensor(img_uvd, center, M, cube, cam_para).unsqueeze(1).repeat(1, J, 1, 1)
        distance = torch.sum(torch.pow(img_xyz - joint_xyz.unsqueeze(2), 2), dim=-1)
        # closeness_value = 1 / (distance + 1e-8)
        # closeness_value_normal = closeness_value / (closeness_value.sum(-1, keepdim=True) + 1e-8)
        closeness_value_normal = 1 / (gamma * distance + 1)
        return closeness_value_normal.view(B, J, H, W)

    def xyz_nl2uvdnl_tensor(self, joint_xyz, center, M, cube_size, cam_paras):
        device = joint_xyz.device
        batch_size, joint_num, _ = joint_xyz.size()
        cube_size_t = cube_size.to(device).view(batch_size, 1, 3).repeat(1, joint_num, 1)
        center_t = center.to(device).view(batch_size, 1, 3).repeat(1, joint_num, 1)
        M_t = M.to(device).view(batch_size, 1, 3, 3).repeat(1, joint_num, 1, 1)
        joint_temp = joint_xyz * cube_size_t / 2.0 + center_t
        joint_uvd = self.points3DToImg(joint_temp, cam_paras)
        joint_uvd = self.get_trans_points(joint_uvd, M_t)
        joint_uv = joint_uvd[:, :, 0:2] / self.img_size * 2.0 - 1
        joint_d = (joint_uvd[:, :, 2:] - center_t[:, :, 2:]) / (cube_size_t[:, :, 2:] / 2)
        joint = torch.cat((joint_uv, joint_d), dim=-1)
        return joint
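
    # Descriptive note (added): the *_nl tensor helpers above convert batched
    # points between two frames: the normalized crop frame, where u, v lie in
    # [-1, 1] over the img_size crop and d is offset by the crop centre and
    # scaled by cube_z/2, and the metric camera frame. Each sample carries its
    # own crop matrix M, centre and cube, so the operations are fully batched.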
    def get_trans_points(self, joints, M):
        device = joints.device
        joints_mat = torch.cat((joints[:, :, 0:2], torch.ones(joints.size(0), joints.size(1), 1).to(device)), dim=-1)
        joints_trans_xy = torch.matmul(M, joints_mat.unsqueeze(-1)).squeeze(-1)[:, :, 0:2]
        joints_trans_z = joints[:, :, 2:]
        return torch.cat((joints_trans_xy, joints_trans_z), dim=-1)

    def getpcl(self, imgD, com3D, cube, M, cam_para=None):
        mask = np.isclose(imgD, 1)
        dpt_ori = imgD * cube[2] / 2.0 + com3D[2]
        # change the background value
        dpt_ori[mask] = 0
        pcl = (self.depthToPCL(dpt_ori, M, cam_para) - com3D)
        pcl_num = pcl.shape[0]
        cube_tile = np.tile(cube / 2.0, pcl_num).reshape([pcl_num, 3])
        pcl = pcl / cube_tile
        return pcl

    def farthest_point_sample(self, xyz, npoint):
        N, C = xyz.shape
        S = npoint
        if N < S:
            centroids = np.arange(N)
            # pad with repeated indices when the cloud is smaller than npoint
            # (replace=True: S - N may exceed N)
            centroids = np.append(centroids, np.random.choice(centroids, size=S - N, replace=True))
        else:
            centroids = np.zeros(S, dtype=np.int64)  # np.int is deprecated
        distance = np.ones(N) * 1e10
        farthest = np.random.randint(0, N)  # seed from any point of the cloud
        for i in range(S):
            centroids[i] = farthest
            centroid = xyz[farthest, :]
            dist = np.sum((xyz - centroid) ** 2, -1)
            mask = dist < distance
            distance[mask] = dist[mask]
            farthest = distance.argmax()
        # np.unique sorts and deduplicates, so fewer than npoint indices may be returned
        return np.unique(centroids)

    def depthToPCL(self, dpt, T, paras=None, background_val=0.):
        if isinstance(paras, tuple):
            fx, fy, fu, fv = paras
        else:
            fx, fy, fu, fv = self.paras
        # get valid points and transform
        pts = np.asarray(np.where(~np.isclose(dpt, background_val))).transpose()
        pts = np.concatenate([pts[:, [1, 0]] + 0.5, np.ones((pts.shape[0], 1), dtype='float32')], axis=1)
        pts = np.dot(np.linalg.inv(np.asarray(T)), pts.T).T
        pts = (pts[:, 0:2] / pts[:, 2][:, None]).reshape((pts.shape[0], 2))
        # replace the invalid data
        depth = dpt[(~np.isclose(dpt, background_val))]
        # get x and y data in a vectorized way
        row = (pts[:, 0] - fu) / fx * depth
        col = self.flip * (pts[:, 1] - fv) / fy * depth
        # combine x,y,depth
        return np.column_stack((row, col, depth))

    def pca_point(self, pcl, joint):
        self.pca.fit(pcl)
        coeff = self.pca.components_.T
        # if coeff[1, 0] < 0:
        #     coeff[:, 0] = -coeff[:, 0]
        # if coeff[2, 2] < 0:
        #     coeff[:, 2] = -coeff[:, 2]
        coeff[:, 1] = np.cross(coeff[:, 2], coeff[:, 0])
        points_rotation = np.dot(pcl, coeff)
        joint_rotation = np.dot(joint, coeff)
        return points_rotation, joint_rotation, coeff

    def img2pcl_index_softmax(self, pcl, img, center, M, cube, cam_para, select_num=64, scale=30):
        '''
        :param pcl: BxNx3 Tensor
        :param img: Bx1xWxH Tensor
        :return: per-point weights and pixel indices of the select_num nearest pixels
        '''
        device = pcl.device
        B, N, _ = pcl.size()
        B, _, W, H = img.size()
        mesh_x = 2.0 * (torch.arange(W).unsqueeze(1).expand(W, W).float() + 0.5) / W - 1.0
        mesh_y = 2.0 * (torch.arange(W).unsqueeze(0).expand(W, W).float() + 0.5) / W - 1.0
        coords = torch.stack((mesh_y, mesh_x), dim=0)
        coords = torch.unsqueeze(coords, dim=0).repeat(B, 1, 1, 1).to(device)
        img_uvd = torch.cat((coords, img), dim=1).view(B, 3, H * W).permute(0, 2, 1)
        img_xyz = self.uvd_nl2xyznl_tensor(img_uvd, center, M, cube, cam_para)
        distance = torch.sum(torch.pow(pcl.unsqueeze(2) - img_xyz.unsqueeze(1), 2), dim=-1)
        distance_value, distance_index = torch.topk(distance, select_num, largest=False)
        distance_value = torch.sqrt(distance_value + 1e-8)
        distance_value = distance_value - distance_value.min(dim=-1, keepdim=True)[0]
        closeness_value = 1 - distance_value / distance_value.max(dim=-1, keepdim=True)[0]
        # closeness_value = torch.clamp(0.1 - distance_value, 0, 1)
        closeness_value_normal = torch.softmax(closeness_value * scale, dim=-1)
        return closeness_value_normal, distance_index
    def img2pcl_index(self, pcl, img, center, M, cube, cam_para, select_num=9):
        '''
        :param pcl: BxNx3 Tensor
        :param img: Bx1xWxH Tensor
        :return: per-point weights and pixel indices of the select_num nearest pixels
        '''
        device = pcl.device
        B, N, _ = pcl.size()
        B, _, W, H = img.size()
        mesh_x = 2.0 * (torch.arange(W).unsqueeze(1).expand(W, W).float() + 0.5) / W - 1.0
        mesh_y = 2.0 * (torch.arange(W).unsqueeze(0).expand(W, W).float() + 0.5) / W - 1.0
        coords = torch.stack((mesh_y, mesh_x), dim=0)
        coords = torch.unsqueeze(coords, dim=0).repeat(B, 1, 1, 1).to(device)
        img_uvd = torch.cat((coords, img), dim=1).view(B, 3, H * W).permute(0, 2, 1)
        img_xyz = self.uvd_nl2xyznl_tensor(img_uvd, center, M, cube, cam_para)
        distance = torch.sum(torch.pow(pcl.unsqueeze(2) - img_xyz.unsqueeze(1), 2), dim=-1)
        distance_value, distance_index = torch.topk(distance, select_num, largest=False)
        # version 1: inverse-distance weights
        closeness_value = 1 / (distance_value + 1e-8)
        closeness_value_normal = closeness_value / (closeness_value.sum(-1, keepdim=True) + 1e-8)
        # version 2: softmax weights (see img2pcl_index_softmax)
        # distance_value = torch.sqrt(distance_value + 1e-8)
        # distance_value = distance_value - distance_value.min(dim=-1, keepdim=True)[0]
        # closeness_value = 1 - distance_value / distance_value.max(dim=-1, keepdim=True)[0]
        # closeness_value_normal = torch.softmax(closeness_value * 30, dim=-1)
        return closeness_value_normal, distance_index

    def pcl2img_index(self, pcl, img_size, center, M, cube, cam_para, select_num=9):
        '''
        :param pcl: BxNx3 Tensor
        :param img_size: side length of the square image grid
        :return: per-point weights and pixel indices of the select_num nearest pixels
        '''
        device = pcl.device
        B, N, _ = pcl.size()
        pcl_uvd = (self.xyz_nl2uvdnl_tensor(pcl, center, M, cube, cam_para)[:, :, :2] + 1) / 2 * img_size
        mesh_x = (torch.arange(img_size).unsqueeze(1).expand(img_size, img_size).float() + 0.5)
        mesh_y = (torch.arange(img_size).unsqueeze(0).expand(img_size, img_size).float() + 0.5)
        coords = torch.stack((mesh_y, mesh_x), dim=0)
        coords = torch.unsqueeze(coords, dim=0).repeat(B, 1, 1, 1).to(device)
        coords = coords.view(B, 2, img_size * img_size).permute(0, 2, 1)
        distance = torch.sqrt(torch.sum(torch.pow(pcl_uvd.unsqueeze(2) - coords.unsqueeze(1), 2), dim=-1) + 1e-8)
        distance_value, distance_index = torch.topk(distance, select_num, largest=False)
        closeness_value = 1 / (distance_value + 1e-8)
        closeness_value_normal = closeness_value / (closeness_value.sum(-1, keepdim=True) + 1e-8)
        return closeness_value_normal, distance_index

    def img2pcl(self, img):
        '''
        :param img: Bx1xWxH Tensor
        :return: Bx(H*W)x3 uvd points built from the pixel grid and depth values
        '''
        device = img.device
        B, _, W, H = img.size()
        mesh_x = 2.0 * (torch.arange(W).unsqueeze(1).expand(W, W).float() + 0.5) / W - 1.0
        mesh_y = 2.0 * (torch.arange(W).unsqueeze(0).expand(W, W).float() + 0.5) / W - 1.0
        coords = torch.stack((mesh_y, mesh_x), dim=0)
        coords = torch.unsqueeze(coords, dim=0).repeat(B, 1, 1, 1).to(device)
        img_uvd = torch.cat((coords, img), dim=1).view(B, 3, H * W).permute(0, 2, 1)
        return img_uvd

    def read_modelPara(self, data_rt, view):
        prefix = data_rt + '/posePara_lm_collosion/' + self.dataset_name + '-' + self.phase + '-' + str(view)
        theta = np.loadtxt(prefix + '-pose.txt').reshape(-1, 45)
        quat = np.loadtxt(prefix + '-glb.txt').reshape(-1, 3)
        scale = np.loadtxt(prefix + '-scale.txt').reshape(-1, 1)
        trans = np.loadtxt(prefix + '-trans.txt').reshape(-1, 3)
        shape = np.loadtxt(prefix + '-shape.txt').reshape(-1, 10)
        model_para = np.concatenate([quat, theta, shape, scale, trans], axis=-1)
        return model_para
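

# A minimal standalone sketch (added; it mirrors the pad-and-sample block
# repeated verbatim in the __getitem__ methods below) of drawing a fixed-size
# point set from a variable-size cloud: when the cloud is smaller than
# sample_num it is tiled and topped up before sampling without replacement.
def sample_fixed_size_pcl(pcl, sample_num=1024):
    pcl_num = pcl.shape[0]
    if pcl_num == 0:
        return np.zeros([sample_num, 3])
    pcl_index = np.arange(pcl_num)
    if pcl_num < sample_num:
        tmp = math.floor(sample_num / pcl_num)
        index_temp = pcl_index.repeat(tmp)
        pcl_index = np.append(index_temp,
                              np.random.choice(pcl_index, size=divmod(sample_num, pcl_num)[1],
                                               replace=False))
    select = np.random.choice(pcl_index, sample_num, replace=False)
    return pcl[select, :]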
class DexYCBDataset(loader):
    def __init__(self, setup, split, root_dir, img_size=128, aug_para=[10, 0.2, 180], input_modal='RGBD'):
        super(DexYCBDataset, self).__init__(root_dir, split, img_size, 'joint_mean', 'DexYCB')
        # self.writer = SummaryWriter('runs/DexYCB_fliter')
        self.setup = setup
        self.split = split
        self.aug_para = aug_para
        self.cube_size = [250, 250, 250]
        self.aug_modes = ['rot', 'com', 'sc', 'none']
        self.flip = 1
        self.root_dir = root_dir + '/DexYCB'
        self.annot_path = osp.join(self.root_dir, 'annotations')
        self.input_modal = input_modal
        self.transform = transforms.ToTensor()
        self.datalist = self.load_data()
        print('loading finish')
        print('len: %d' % (len(self.datalist)))

    def load_data(self):
        db = COCO(osp.join(self.annot_path, "DEX_YCB_{}_{}_data.json".format(self.setup, self.split)))
        datalist = []
        for aid in db.anns.keys():
            ann = db.anns[aid]
            image_id = ann['image_id']
            img = db.loadImgs(image_id)[0]
            img_path = osp.join(self.root_dir, img['color_file_name'])
            img_path = img_path.replace('/home/pfren/dataset/', '/home/cyc/pycharm/data/')
            img_shape = (img['height'], img['width'])
            joints_coord_cam = np.array(ann['joints_coord_cam'], dtype=np.float32) / 1000  # meter
            hand_type = ann['hand_type']
            # skip samples whose joints are all marked invalid (-1)
            if joints_coord_cam.sum() == -63:
                continue
            cam_param = {k: np.array(v, dtype=np.float32) for k, v in ann['cam_param'].items()}
            mano_pose = np.array(ann['mano_param']['pose'], dtype=np.float32)
            mano_shape = np.array(ann['mano_param']['shape'], dtype=np.float32)
            mano_trans = np.array(ann['mano_param']['trans'], dtype=np.float32)
            data = {"img_path": img_path, "img_shape": img_shape, "joints_coord_cam": joints_coord_cam,
                    "cam_param": cam_param, "mano_pose": mano_pose, "mano_shape": mano_shape,
                    'mano_trans': mano_trans, "hand_type": hand_type}
            if self.split != 'train':
                data["image_id"] = image_id
            datalist.append(data)
        return datalist

    def __len__(self):
        return len(self.datalist)
    def __getitem__(self, idx):
        data = copy.deepcopy(self.datalist[idx])
        img_path, img_shape = data['img_path'], data['img_shape']
        hand_type = data['hand_type']
        do_flip = (hand_type == 'left')

        if 'RGB' in self.input_modal:
            rgb = cv2.imread(img_path)
            if not isinstance(rgb, np.ndarray):
                raise IOError("Fail to read %s" % img_path)
        depth = cv2.imread(img_path.replace('color_', 'aligned_depth_to_color_').replace('jpg', 'png'),
                           cv2.IMREAD_ANYDEPTH)

        intrinsics = data['cam_param']
        cam_para = (intrinsics['focal'][0], intrinsics['focal'][1],
                    intrinsics['princpt'][0], intrinsics['princpt'][1])
        joint_xyz = data['joints_coord_cam'].reshape([21, 3])[DexYCB2MANO, :] * 1000
        joint_uvd = self.joint3DToImg(joint_xyz, cam_para)
        mano_pose, mano_shape, mano_trans = data['mano_pose'], data['mano_shape'], data['mano_trans']

        if do_flip:
            if 'RGB' in self.input_modal:
                rgb = rgb[:, ::-1].copy()
            depth = depth[:, ::-1].copy()
            joint_uvd[:, 0] = img_shape[1] - joint_uvd[:, 0] - 1
            joint_xyz = self.jointImgTo3D(joint_uvd, cam_para)

        center_xyz = joint_xyz.mean(0)
        gt3Dcrop = joint_xyz - center_xyz
        center_uvd = self.joint3DToImg(center_xyz, cam_para)
        depth_crop, trans = self.Crop_Image_deep_pp(depth, center_uvd, self.cube_size,
                                                    (self.img_size, self.img_size), cam_para)
        if 'RGB' in self.input_modal:
            rgb_crop, trans_rgb = self.Crop_Image_deep_pp_RGB(copy.deepcopy(rgb), center_uvd, self.cube_size,
                                                              (self.img_size, self.img_size), cam_para)

        if self.split == 'train':
            mode, off, rot, sc = self.rand_augment(sigma_com=self.aug_para[0], sigma_sc=self.aug_para[1],
                                                   rot_range=self.aug_para[2])
            imgD, _, curLabel, cube, com2D, M, _ = self.augmentCrop(depth_crop, gt3Dcrop, center_uvd,
                                                                    self.cube_size, trans, mode, off, rot, sc,
                                                                    cam_para)
            if 'RGB' in self.input_modal:
                imgRGB, _, curLabel_rgb, cube_rgb, com2D_rgb, M_rgb, _ = self.augmentCrop_RGB(
                    rgb_crop, gt3Dcrop, center_uvd, self.cube_size, trans_rgb, mode, off, rot, sc, cam_para)
                # RGB normalization
                imgRGB = self.transform(imgRGB.astype(np.float32)) / 255.
            curLabel = curLabel / (cube[2] / 2.0)
        else:
            imgD = self.normalize_img(depth_crop.max(), depth_crop, center_xyz, self.cube_size)
            if 'RGB' in self.input_modal:
                imgRGB = self.transform(rgb_crop.astype(np.float32)) / 255.
            curLabel = gt3Dcrop / (self.cube_size[2] / 2.0)
            cube = np.array(self.cube_size)
            com2D = center_uvd
            M = trans

        com3D = self.jointImgTo3D(com2D, cam_para)
        joint_img = transformPoints2D(self.joint3DToImg(curLabel * (cube[0] / 2.0) + com3D, cam_para), M)
        joint_img[:, 0:2] = joint_img[:, 0:2] / (self.img_size / 2) - 1
        joint_img[:, 2] = (joint_img[:, 2] - com3D[2]) / (cube[0] / 2.0)

        # get pcl
        pcl = self.getpcl(imgD, com3D, cube, M, cam_para)
        pcl_index = np.arange(pcl.shape[0])
        pcl_num = pcl.shape[0]
        if pcl_num == 0:
            pcl_sample = np.zeros([self.sample_num, 3])
        else:
            if pcl_num < self.sample_num:
                tmp = math.floor(self.sample_num / pcl_num)
                index_temp = pcl_index.repeat(tmp)
                pcl_index = np.append(index_temp,
                                      np.random.choice(pcl_index, size=divmod(self.sample_num, pcl_num)[1],
                                                       replace=False))
            select = np.random.choice(pcl_index, self.sample_num, replace=False)
            pcl_sample = pcl[select, :]
        pcl_sample = torch.from_numpy(pcl_sample).float()

        data_depth = torch.from_numpy(imgD).float()
        data_depth = data_depth.unsqueeze(0)
        if 'RGB' in self.input_modal:
            data_rgb = imgRGB
        else:
            data_rgb = None
        joint_img = torch.from_numpy(joint_img).float()
        joint = torch.from_numpy(curLabel).float()
        center = torch.from_numpy(com3D).float()
        M = torch.from_numpy(M).float()
        cube = torch.from_numpy(cube).float()
        cam_para = torch.from_numpy(np.array(cam_para)).float()
        return data_rgb, data_depth, pcl_sample, joint, joint_img, center, M, cube, cam_para
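

# Hedged usage sketch (added; the root path is a placeholder and the 's0'
# setup tag is an assumption, not prescribed by this file):
def _build_dexycb_loader(root_dir, batch_size=32):
    dataset = DexYCBDataset(setup='s0', split='train', root_dir=root_dir,
                            img_size=128, input_modal='RGBD')
    # each batch yields (rgb, depth, pcl, joint, joint_img, center, M, cube, cam_para)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)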
class HO3D(loader):
    def __init__(self, data_split, root_dir, dataset_version='v3', img_size=128, center_type='refine',
                 aug_para=[10, 0.2, 180], cube_size=[280, 280, 280], input_modal='RGBD', color_factor=0.2):
        super(HO3D, self).__init__(root_dir, data_split, img_size, center_type, 'HO3D')
        self.data_split = data_split
        self.dataset_version = dataset_version
        self.root_dir = osp.join(root_dir, 'HO3D_%s' % (dataset_version))
        self.annot_path = osp.join(self.root_dir, 'annotations')
        self.root_joint_idx = 0
        self.color_factor = color_factor
        self.aug_para = aug_para
        self.cube_size = cube_size
        self.aug_modes = ['rot', 'com', 'sc', 'none']
        self.flip = 1
        if center_type == 'refine':
            self.center_xyz = np.loadtxt(self.root_dir + '/annotations/%s_refine_center_xyz.txt' % (data_split))
        self.dataset_len = 0
        self.datalist = self.load_data()
        print('Dataset len:' + str(self.dataset_len))
        self.transform = transforms.ToTensor()
        self.input_modal = input_modal

    def load_data(self):
        db = COCO(osp.join(self.annot_path, "HO3D_{}_data.json".format(self.data_split)))
        datalist = []
        for aid in db.anns.keys():
            ann = db.anns[aid]
            image_id = ann['image_id']
            img = db.loadImgs(image_id)[0]
            img_shape = (img['height'], img['width'])
            if self.data_split == 'train' or self.data_split == 'test' or self.data_split == 'train_all':
                img_path = osp.join(self.root_dir, img['file_name'])
                joints_coord_cam = np.array(ann['joints_coord_cam'], dtype=np.float32).reshape([21, 3])  # meter
                cam_param = {k: np.array(v, dtype=np.float32) for k, v in ann['cam_param'].items()}
                fx, fy, fu, fv = cam_param['focal'][0], cam_param['focal'][1], \
                                 cam_param['princpt'][0], cam_param['princpt'][1]
                joints_coord_img = self.joint3DToImg(joints_coord_cam, (fx, fy, fu, fv))
                center_2d = self.get_center(joints_coord_img[:, :2], np.ones_like(joints_coord_img[:, 0]))
                bbox = self.get_bbox(joints_coord_img[:, :2], expansion_factor=1.5)  # get a tight bbox from the joints
                bbox = self.process_bbox(bbox, img_shape[1], img_shape[0], expansion_factor=1.0)  # drop invalid boxes
                if bbox is None:
                    continue
                self.dataset_len += 1
                mano_pose = np.array(ann['mano_param']['pose'], dtype=np.float32)
                mano_shape = np.array(ann['mano_param']['shape'], dtype=np.float32)
                mano_trans = np.array(ann['mano_param']['trans'], dtype=np.float32)
                data = {"img_path": img_path, "img_shape": img_shape, "joints_coord_cam": joints_coord_cam,
                        "joints_coord_img": joints_coord_img, "center_2d": center_2d, "cam_param": cam_param,
                        "mano_pose": mano_pose, "mano_shape": mano_shape, "mano_trans": mano_trans}
            else:
                img_path = osp.join(self.root_dir, img['file_name'])
                root_joint_cam = np.array(ann['root_joint_cam'], dtype=np.float32)
                cam_param = {k: np.array(v, dtype=np.float32) for k, v in ann['cam_param'].items()}
                bbox = np.array(ann['bbox'], dtype=np.float32)
                center_2d = [bbox[0], bbox[1]]
                data = {"img_path": img_path, "img_shape": img_shape, "root_joint_cam": root_joint_cam,
                        "center_2d": center_2d, "cam_param": cam_param}
                self.dataset_len += 1
            datalist.append(data)
        return datalist
    def __len__(self):
        return self.dataset_len

    def __getitem__(self, idx):
        data = copy.deepcopy(self.datalist[idx])
        img_path, img_shape = data['img_path'], data['img_shape']
        if 'RGB' in self.input_modal:
            if self.dataset_version == 'v2':
                rgb = cv2.imread(img_path)
            else:
                rgb = cv2.imread(img_path.replace('png', 'jpg'))
            if not isinstance(rgb, np.ndarray):
                raise IOError("Fail to read %s" % img_path)
        depth = self.read_depth_img(img_path.replace('rgb', 'depth'))

        intrinsics = data['cam_param']
        cam_para = (intrinsics['focal'][0], intrinsics['focal'][1],
                    intrinsics['princpt'][0], intrinsics['princpt'][1])

        if self.phase == 'train' or self.phase == 'test' or self.phase == 'train_all':
            joint_xyz = data['joints_coord_cam'].reshape([21, 3])[HO3D2MANO, :] * 1000
            joint_uvd = self.joint3DToImg(joint_xyz, cam_para)
            if self.center_type == 'refine':
                center_xyz = self.center_xyz[idx]
            else:
                center_xyz = joint_xyz.mean(0)
            gt3Dcrop = joint_xyz - center_xyz
        else:
            joint_xyz = np.ones([32, 3])
            joint_uvd = np.ones([32, 3])
            gt3Dcrop = np.ones([32, 3])
            if self.center_type == 'refine':
                center_xyz = self.center_xyz[idx]
            else:
                center_xyz = joint_xyz.mean(0)

        center_uvd = self.joint3DToImg(center_xyz, cam_para)
        depth_crop, trans = self.Crop_Image_deep_pp(copy.deepcopy(depth), center_uvd, self.cube_size,
                                                    (self.img_size, self.img_size), cam_para)
        if 'RGB' in self.input_modal:
            rgb_crop, trans_rgb = self.Crop_Image_deep_pp_RGB(copy.deepcopy(rgb), center_uvd, self.cube_size,
                                                              (self.img_size, self.img_size), cam_para)

        if 'train' in self.phase:
            mode, off, rot, sc = self.rand_augment(sigma_com=self.aug_para[0], sigma_sc=self.aug_para[1],
                                                   rot_range=self.aug_para[2])
            imgD, _, curLabel, cube, com2D, M, _ = self.augmentCrop(depth_crop, gt3Dcrop, center_uvd,
                                                                    self.cube_size, trans, mode, off, rot, sc,
                                                                    cam_para)
            if 'RGB' in self.input_modal:
                imgRGB, _, curLabel_rgb, cube_rgb, com2D_rgb, M_rgb, _ = self.augmentCrop_RGB(
                    rgb_crop, gt3Dcrop, center_uvd, self.cube_size, trans_rgb, mode, off, rot, sc, cam_para)
                if self.color_factor != 0:
                    # RGB augment
                    c_up = 1.0 + self.color_factor
                    c_low = 1.0 - self.color_factor
                    color_scale = np.array([random.uniform(c_low, c_up), random.uniform(c_low, c_up),
                                            random.uniform(c_low, c_up)])
                    imgRGB = np.clip(imgRGB * color_scale[None, None, :], 0, 255)
                # RGB normalization
                # imgRGB = self.set_rgb_bg(imgRGB, imgD)
                imgRGB = self.transform(imgRGB.astype(np.float32)) / 255.
            curLabel = curLabel / (cube[2] / 2.0)
            com3D = self.jointImgTo3D(com2D, cam_para)
            mano_pose, mano_shape, mano_trans = data['mano_pose'], data['mano_shape'], data['mano_trans']
            mano_para = np.concatenate((mano_pose, mano_shape, mano_trans), axis=0)
            if mode == 0:
                rot_aug_mat = np.array([[np.cos(np.deg2rad(rot)), -np.sin(np.deg2rad(rot)), 0],
                                        [np.sin(np.deg2rad(rot)), np.cos(np.deg2rad(rot)), 0],
                                        [0, 0, 1]], dtype=np.float32)
                # mesh_uvd = np.dot(rot_aug_mat, (mesh_uvd - com2D).transpose(1, 0)).transpose(1, 0) + com2D
        elif self.phase == 'test':
            imgD = self.normalize_img(depth_crop.max(), depth_crop, center_xyz, self.cube_size)
            if 'RGB' in self.input_modal:
                # imgRGB = self.set_rgb_bg(imgRGB, imgD)
                imgRGB = self.transform(rgb_crop.astype(np.float32)) / 255.
            curLabel = gt3Dcrop / (self.cube_size[2] / 2.0)
            cube = np.array(self.cube_size)
            com2D = center_uvd
            M = trans
            com3D = self.jointImgTo3D(com2D, cam_para)
            mano_pose, mano_shape, mano_trans = data['mano_pose'], data['mano_shape'], data['mano_trans']
            mano_para = np.concatenate((mano_pose, mano_shape, mano_trans), axis=0)
        else:
            imgD = self.normalize_img(depth_crop.max(), depth_crop, center_xyz, self.cube_size)
            if 'RGB' in self.input_modal:
                # imgRGB = self.set_rgb_bg(imgRGB, imgD)
                imgRGB = self.transform(rgb_crop.astype(np.float32)) / 255.
            curLabel = gt3Dcrop / (self.cube_size[2] / 2.0)
            cube = np.array(self.cube_size)
            com2D = center_uvd
            M = trans
            com3D = self.jointImgTo3D(com2D, cam_para)
            mano_para = np.ones([55])

        joint_img = transformPoints2D(self.joint3DToImg(curLabel * (cube[0] / 2.0) + com3D, cam_para), M)
        joint_img[:, 0:2] = joint_img[:, 0:2] / (self.img_size / 2) - 1
        joint_img[:, 2] = (joint_img[:, 2] - com3D[2]) / (cube[0] / 2.0)

        # get pcl
        pcl = self.getpcl(imgD, com3D, cube, M, cam_para)
        pcl_index = np.arange(pcl.shape[0])
        pcl_num = pcl.shape[0]
        if pcl_num == 0:
            pcl_sample = np.zeros([self.sample_num, 3])
        else:
            if pcl_num < self.sample_num:
                tmp = math.floor(self.sample_num / pcl_num)
                index_temp = pcl_index.repeat(tmp)
                pcl_index = np.append(index_temp,
                                      np.random.choice(pcl_index, size=divmod(self.sample_num, pcl_num)[1],
                                                       replace=False))
            select = np.random.choice(pcl_index, self.sample_num, replace=False)
            pcl_sample = pcl[select, :]
        pcl_sample = torch.from_numpy(pcl_sample).float()
        pcl_sample = torch.clamp(pcl_sample, -1, 1)

        data_depth = torch.from_numpy(imgD).float()
        data_depth = data_depth.unsqueeze(0)
        if 'RGB' in self.input_modal:
            data_rgb = imgRGB
        else:
            data_rgb = None
        joint_img = torch.from_numpy(joint_img).float()
        joint = torch.from_numpy(curLabel).float()
        center = torch.from_numpy(com3D).float()
        M = torch.from_numpy(M).float()
        cube = torch.from_numpy(cube).float()
        cam_para = torch.from_numpy(np.array(cam_para)).float()
        return data_rgb, data_depth, pcl_sample, joint, joint_img, center, M, cube, cam_para

    def get_center(self, joint_img, joint_valid):
        x_img, y_img = joint_img[:, 0], joint_img[:, 1]
        x_img = x_img[joint_valid == 1]
        y_img = y_img[joint_valid == 1]
        xmin = min(x_img)
        ymin = min(y_img)
        xmax = max(x_img)
        ymax = max(y_img)
        x_center = (xmin + xmax) / 2.
        y_center = (ymin + ymax) / 2.
        return [x_center, y_center]

    def get_bbox(self, joint_img, expansion_factor=1.0):
        x_img, y_img = joint_img[:, 0], joint_img[:, 1]
        xmin = min(x_img)
        ymin = min(y_img)
        xmax = max(x_img)
        ymax = max(y_img)
        x_center = (xmin + xmax) / 2.
        width = (xmax - xmin) * expansion_factor
        xmin = x_center - 0.5 * width
        xmax = x_center + 0.5 * width
        y_center = (ymin + ymax) / 2.
        height = (ymax - ymin) * expansion_factor
        ymin = y_center - 0.5 * height
        ymax = y_center + 0.5 * height
        bbox = np.array([xmin, ymin, xmax - xmin, ymax - ymin]).astype(np.float32)
        return bbox
    def process_bbox(self, bbox, img_width, img_height, expansion_factor=1.25):
        # sanitize bboxes
        x, y, w, h = bbox
        x1 = np.max((0, x))
        y1 = np.max((0, y))
        x2 = np.min((img_width - 1, x1 + np.max((0, w - 1))))
        y2 = np.min((img_height - 1, y1 + np.max((0, h - 1))))
        if w * h > 0 and x2 >= x1 and y2 >= y1:
            bbox = np.array([x1, y1, x2 - x1, y2 - y1])
        else:
            return None
        # aspect ratio preserving bbox
        w = bbox[2]
        h = bbox[3]
        c_x = bbox[0] + w / 2.
        c_y = bbox[1] + h / 2.
        aspect_ratio = 1
        if w > aspect_ratio * h:
            h = w / aspect_ratio
        elif w < aspect_ratio * h:
            w = h * aspect_ratio
        bbox[2] = w * expansion_factor
        bbox[3] = h * expansion_factor
        bbox[0] = c_x - bbox[2] / 2.
        bbox[1] = c_y - bbox[3] / 2.
        return bbox

    def read_depth_img(self, depth_filename):
        """Read a depth image from the dataset and decode it to mm"""
        depth_scale = 0.00012498664727900177
        depth_img = cv2.imread(depth_filename)
        # cast before the shift to avoid uint8 overflow
        dpt = depth_img[:, :, 2].astype(np.float32) + depth_img[:, :, 1].astype(np.float32) * 256
        dpt = dpt * depth_scale * 1000
        return dpt

    def read_seg_img(self, filename):
        """Read a segmentation image from the dataset and decode it to labels"""
        seg_img = cv2.imread(filename)
        h, w, c = seg_img.shape
        seg_img = seg_img.reshape([-1, 3])
        seg_label = np.zeros([h, w]).reshape([-1])
        seg_label[seg_img[:, 0] == 255] = 1
        seg_label[seg_img[:, 1] == 255] = 2
        seg_label = seg_label.reshape([h, w])
        seg_label = cv2.resize(seg_label, (640, 480), interpolation=cv2.INTER_NEAREST)
        return seg_label
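

# Illustrative decode check (added; a standalone copy of the read_depth_img
# arithmetic on a synthetic pixel, under the assumption that channel 2 holds
# the low byte and channel 1 the high byte of the 16-bit depth code):
# depth_scale converts the code to metres, and the extra x1000 yields mm.
def _decode_ho3d_depth_pixel(low_byte, high_byte, depth_scale=0.00012498664727900177):
    return (low_byte + high_byte * 256) * depth_scale * 1000.0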
class nyu_loader(loader):
    def __init__(self, root_dir, phase, aug_para=[10, 0.1, 180], img_size=128, cube_size=[250, 250, 250],
                 center_type='refine', joint_num=23, loader=nyu_reader, color_factor=0):
        super(nyu_loader, self).__init__(root_dir, phase, img_size, center_type, 'nyu')
        self.paras = (588.03, 587.07, 320., 240.)
        self.cube_size = np.array(cube_size)
        self.allJoints = True
        self.flip = 1
        self.color_factor = color_factor
        self.croppedSz = img_size
        self.joint_num = joint_num
        self.aug_modes = ['rot', 'com', 'sc', 'none']
        self.aug_para = aug_para
        self.transform = transforms.ToTensor()

        data_path = '{}/{}'.format(self.root_dir, self.phase)
        label_path = '{}/joint_data.mat'.format(data_path)
        print('loading data...')
        self.labels = sio.loadmat(label_path)
        self.data_path = data_path
        self.all_joints_uvd = self.labels['joint_uvd'][0][:, joint_select, :][:, calculate, :]
        self.all_joints_xyz = self.labels['joint_xyz'][0][:, joint_select, :][:, calculate, :]
        self.all_joints_xyz = self.jointImgTo3D(self.joint3DToImg(self.all_joints_xyz, flip=-1))
        print('finish!!')

        if center_type == 'refine':
            center_path = '{}/center_{}_refined.txt'.format(data_path, self.phase)
            self.center_xyz = np.loadtxt(center_path)
            self.center_xyz = self.jointImgTo3D(self.joint3DToImg(self.center_xyz, flip=-1))
        elif center_type == 'joint':
            self.center_xyz = self.all_joints_xyz[:, 20, :]
        elif center_type == 'joint_mean':
            self.center_xyz = self.all_joints_xyz.mean(1)
        elif center_type == 'random':
            self.center_xyz = self.all_joints_xyz.mean(1)
        self.loader = loader
        self.test_cubesize = np.ones([8252, 3]) * self.cube_size
        self.test_cubesize[2440:, :] = self.test_cubesize[2440:, :] * 5.0 / 6.0
        self.all_model_para = self.read_modelPara(root_dir, 0)
        self.joint_kernel = torch.Tensor([0.15, 0.15, 0.15, 0.25, 0.15, 0.15, 0.15, 0.25,
                                          0.15, 0.15, 0.15, 0.25, 0.15, 0.15, 0.15, 0.25,
                                          0.25, 0.25, 0.25, 0.35, 0.4, 0.4, 0.4]) * 1.1

    def __getitem__(self, index):
        img_path = self.data_path + '/depth_1_{:07d}.png'.format(index + 1)
        if not os.path.exists(img_path):
            print(img_path)
        depth = self.loader(img_path)
        RGB = cv2.imread(img_path.replace("depth", "rgb"))
        if not isinstance(RGB, np.ndarray):
            print("RGB read fail")
            RGB = np.zeros(shape=(480, 640, 3), dtype=np.uint8)
        joint_xyz = self.all_joints_xyz[index].copy()
        if self.phase == 'test':
            cube_size = self.test_cubesize[index]
        else:
            cube_size = self.cube_size
        center_xyz = self.center_xyz[index]
        center_uvd = self.joint3DToImg(center_xyz)
        gt3Dcrop = joint_xyz - center_xyz.reshape(1, 3)

        depth_crop, trans = self.Crop_Image_deep_pp(depth, center_uvd, cube_size,
                                                    (self.img_size, self.img_size), self.paras)
        rgb_crop, trans_RGB = self.Crop_Image_deep_pp_RGB(RGB, center_uvd, cube_size,
                                                          (self.img_size, self.img_size), self.paras)
        if self.phase == 'train':
            mode, off, rot, sc = self.rand_augment(sigma_com=self.aug_para[0], sigma_sc=self.aug_para[1],
                                                   rot_range=self.aug_para[2])
            imgD, _, curLabel, cube, com2D, M, _ = self.augmentCrop(depth_crop, gt3Dcrop, center_uvd,
                                                                    self.cube_size, trans, mode, off, rot, sc,
                                                                    self.paras)
            imgRGB, _, _, _, _, _, _ = self.augmentCrop_RGB(rgb_crop, gt3Dcrop, center_uvd, self.cube_size,
                                                            trans_RGB, mode, off, rot, sc, self.paras)
            if self.color_factor != 0:
                # RGB augment
                c_up = 1.0 + self.color_factor
                c_low = 1.0 - self.color_factor
                color_scale = np.array([random.uniform(c_low, c_up), random.uniform(c_low, c_up),
                                        random.uniform(c_low, c_up)])
                imgRGB = np.clip(imgRGB * color_scale[None, None, :], 0, 255)
            # RGB normalization
            imgRGB = self.transform(imgRGB.astype(np.float32)) / 255.
            curLabel = curLabel / (cube[2] / 2.0)
        else:
            imgD = self.normalize_img(depth_crop.max(), depth_crop, center_xyz, cube_size)
            imgRGB = self.transform(rgb_crop.astype(np.float32)) / 255.
            curLabel = gt3Dcrop / (cube_size[2] / 2.0)
            cube = np.array(cube_size)
            com2D = center_uvd
            M = trans

        com3D = self.jointImgTo3D(com2D)
        joint_img = transformPoints2D(self.joint3DToImg(curLabel * (cube[0] / 2.0) + com3D), M)
        joint_img[:, 0:2] = joint_img[:, 0:2] / (self.img_size / 2) - 1
        joint_img[:, 2] = (joint_img[:, 2] - com3D[2]) / (cube[0] / 2.0)

        # get pcl
        pcl = self.getpcl(imgD, com3D, cube, M)
        pcl_index = np.arange(pcl.shape[0])
        pcl_num = pcl.shape[0]
        if pcl_num == 0:
            pcl_sample = np.zeros([self.sample_num, 3])
        else:
            if pcl_num < self.sample_num:
                tmp = math.floor(self.sample_num / pcl_num)
                index_temp = pcl_index.repeat(tmp)
                pcl_index = np.append(index_temp,
                                      np.random.choice(pcl_index, size=divmod(self.sample_num, pcl_num)[1],
                                                       replace=False))
            select = np.random.choice(pcl_index, self.sample_num, replace=False)
            pcl_sample = pcl[select, :]

        data = torch.from_numpy(imgD).float()
        data = data.unsqueeze(0)
        data_rgb = imgRGB
        pcl_sample = torch.from_numpy(pcl_sample).float()
        joint_img = torch.from_numpy(joint_img).float()
        joint = torch.from_numpy(curLabel).float()
        center = torch.from_numpy(com3D).float()
        M = torch.from_numpy(M).float()
        cube = torch.from_numpy(cube).float()
        cam_para = torch.from_numpy(np.array(self.paras)).float()
        return data_rgb, data, pcl_sample, joint, joint_img, center, M, cube, cam_para

    def __len__(self):
        return len(self.all_joints_xyz)


def xyz2error(output, joint, center, cube_size):
    output = output.detach().cpu().numpy()
    joint = joint.detach().cpu().numpy()
    center = center.detach().cpu().numpy()
    cube_size = cube_size.detach().cpu().numpy()
    batchsize, joint_num, _ = output.shape
    center = np.tile(center.reshape(batchsize, 1, -1), [1, joint_num, 1])
    cube_size = np.tile(cube_size.reshape(batchsize, 1, -1), [1, joint_num, 1])
    joint_xyz = output * cube_size / 2 + center
    joint_world_select = joint * cube_size / 2 + center
    errors = (joint_xyz - joint_world_select) * (joint_xyz - joint_world_select)
    errors = np.sqrt(np.sum(errors, axis=2))
    return errors


def Matr(theta):
    B = theta.size(0)
    sin_theta = torch.sin(theta)
    cos_theta = torch.cos(theta)
    zero_theta = torch.zeros_like(theta)
    one_theta = torch.ones_like(theta)
    row_1 = torch.stack([cos_theta, -sin_theta, zero_theta], dim=-1).view(B, 1, 3)
    row_2 = torch.stack([sin_theta, cos_theta, zero_theta], dim=-1).view(B, 1, 3)
    row_3 = torch.stack([zero_theta, zero_theta, one_theta], dim=-1).view(B, 1, 3)
    return torch.cat([row_1, row_2, row_3], dim=1)


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
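

# Hedged end-to-end sketch (added; assumes NYU data laid out as
# <root>/<phase>/depth_1_*.png plus joint_data.mat, which is what
# nyu_loader.__getitem__ expects; the path argument is a placeholder).
# Feeding the ground-truth joints through xyz2error should give zero error,
# which makes this a cheap pipeline self-check.
def _pipeline_self_check(root_dir):
    set_seed(0)
    dataset = nyu_loader(root_dir, 'train', img_size=128)
    loader_ = DataLoader(dataset, batch_size=16, shuffle=True)
    for data_rgb, data, pcl, joint, joint_img, center, M, cube, cam_para in loader_:
        errors = xyz2error(joint, joint, center, cube)  # per-joint error in mm
        assert np.allclose(errors, 0)
        break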