"""
Provides a set of PyTorch transforms that use OpenCV instead of PIL (the PyTorch
default) for image manipulation.
"""
import collections.abc
import math
import numbers
import random

import cv2
import matplotlib
import matplotlib.cm
import matplotlib.colors
import numpy as np
import torch

class Compose(object):
    # Composes transforms, e.g. transforms.Compose([transforms.RandScale([0.5, 2.0]), transforms.ToTensor()])
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        for t in self.transforms:
            images, labels, intrinsics, cam_models, other_labels, transform_paras = t(
                images, labels, intrinsics, cam_models, other_labels, transform_paras)
        return images, labels, intrinsics, cam_models, other_labels, transform_paras

class ToTensor(object):
    # Converts numpy.ndarray (H x W x C) to a torch.FloatTensor of shape (C x H x W).
    def __init__(self, **kwargs):
        return

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        if not isinstance(images, list) or not isinstance(labels, list) or not isinstance(intrinsics, list):
            raise (RuntimeError("transform.ToTensor() only handles lists of images/labels/intrinsics."))
        if len(images) != len(intrinsics):
            raise (RuntimeError("Number of images and number of intrinsics do not match."))
        if not isinstance(images[0], np.ndarray) or not isinstance(labels[0], np.ndarray):
            raise (RuntimeError("transform.ToTensor() only handles np.ndarray for the input and label "
                                "[e.g. data read by cv2.imread()].\n"))
        if not isinstance(intrinsics[0], list):
            raise (RuntimeError("transform.ToTensor() only handles a list for the camera intrinsics."))
        if len(images[0].shape) > 3 or len(images[0].shape) < 2:
            raise (RuntimeError("transform.ToTensor() only handles images (np.ndarray) with 2 or 3 dims.\n"))
        if len(labels[0].shape) > 3 or len(labels[0].shape) < 2:
            raise (RuntimeError("transform.ToTensor() only handles labels (np.ndarray) with 2 or 3 dims.\n"))
        if len(intrinsics[0]) > 4 or len(intrinsics[0]) < 3:
            raise (RuntimeError("transform.ToTensor() only handles intrinsics (list) with 3 or 4 entries.\n"))

        for i, img in enumerate(images):
            if len(img.shape) == 2:
                img = np.expand_dims(img, axis=2)
            images[i] = torch.from_numpy(img.transpose((2, 0, 1))).float()
        for i, lab in enumerate(labels):
            if len(lab.shape) == 2:
                lab = np.expand_dims(lab, axis=0)
            labels[i] = torch.from_numpy(lab).float()
        for i, intrinsic in enumerate(intrinsics):
            if len(intrinsic) == 3:
                # [f, u0, v0] -> [fx, fy, u0, v0], assuming fx == fy == f
                intrinsic = [intrinsic[0], ] + intrinsic
            intrinsics[i] = torch.tensor(intrinsic, dtype=torch.float)
        if cam_models is not None:
            for i, cam_model in enumerate(cam_models):
                cam_models[i] = torch.from_numpy(cam_model.transpose((2, 0, 1))).float() if cam_model is not None else None
        if other_labels is not None:
            for i, lab in enumerate(other_labels):
                if len(lab.shape) == 2:
                    lab = np.expand_dims(lab, axis=0)
                other_labels[i] = torch.from_numpy(lab).float()
        return images, labels, intrinsics, cam_models, other_labels, transform_paras

class Normalize(object):
    # Normalize tensor with mean and standard deviation along channel: channel = (channel - mean) / std
    def __init__(self, mean, std=None, **kwargs):
        if std is None:
            assert len(mean) > 0
        else:
            assert len(mean) == len(std)
        self.mean = torch.tensor(mean).float()[:, None, None]
        self.std = torch.tensor(std).float()[:, None, None] if std is not None \
            else torch.tensor([1.0, 1.0, 1.0]).float()[:, None, None]

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        for i, img in enumerate(images):
            img = torch.div((img - self.mean), self.std)
            images[i] = img
        return images, labels, intrinsics, cam_models, other_labels, transform_paras

class LableScaleCanonical(object):
    """
    Resolve the scale ambiguity of the mono branch (different focal lengths can yield the
    same image-space object size at different depths) by mapping all cameras to a canonical
    space: the focal length is set to a canonical value and the depth labels are scaled by
    the same ratio. NOTE: resizing the image by this ratio would also resolve the ambiguity.
    Args:
        images: list of RGB images.
        labels: list of depth/disparity labels.
        other_labels: other labels, such as instance segmentations, semantic segmentations...
    """
    def __init__(self, **kwargs):
        self.canonical_focal = kwargs['focal_length']

    def _get_scale_ratio(self, intrinsic):
        target_focal_x = intrinsic[0]
        label_scale_ratio = self.canonical_focal / target_focal_x
        pose_scale_ratio = 1.0
        return label_scale_ratio, pose_scale_ratio

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        assert len(images[0].shape) == 3 and len(labels[0].shape) == 2
        assert labels[0].dtype == np.float32

        label_scale_ratio = None
        pose_scale_ratio = None

        for i in range(len(intrinsics)):
            img_i = images[i]
            label_i = labels[i] if i < len(labels) else None
            intrinsic_i = intrinsics[i].copy()
            cam_model_i = cam_models[i] if cam_models is not None and i < len(cam_models) else None

            label_scale_ratio, pose_scale_ratio = self._get_scale_ratio(intrinsic_i)
            # adjust the focal length, i.e. map the current camera to the canonical space
            intrinsics[i] = [intrinsic_i[0] * label_scale_ratio, intrinsic_i[1] * label_scale_ratio, intrinsic_i[2], intrinsic_i[3]]
            # scale the label to the canonical space
            if label_i is not None:
                labels[i] = label_i * label_scale_ratio

            if cam_model_i is not None:
                # As the focal length is adjusted to the canonical one, the camera model has to be rebuilt
                ori_h, ori_w, _ = img_i.shape
                cam_models[i] = build_camera_model(ori_h, ori_w, intrinsics[i])

        if transform_paras is not None:
            transform_paras.update(label_scale_factor=label_scale_ratio, focal_scale_factor=label_scale_ratio)

        return images, labels, intrinsics, cam_models, other_labels, transform_paras
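
# Worked example (illustrative numbers, not from any config): with a canonical focal length
# of 1000 px and an input camera whose fx = 500 px, label_scale_ratio = 1000 / 500 = 2. The
# intrinsics become [1000, 1000, u0, v0] and every depth value is doubled, so a point at 3 m
# is relabelled as 6 m in the canonical camera space. Downstream code can presumably undo the
# scaling via the 'label_scale_factor' recorded in transform_paras.
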
class ResizeKeepRatio(object):
    """
    Resize and pad to a given size while keeping the aspect ratio.
    This resizing assumes that the camera model remains unchanged.
    Args:
        resize_size: predefined output size.
    """
    def __init__(self, resize_size, padding=None, ignore_label=-1, **kwargs):
        if isinstance(resize_size, int):
            self.resize_h = resize_size
            self.resize_w = resize_size
        elif isinstance(resize_size, collections.abc.Iterable) and len(resize_size) == 2 \
                and isinstance(resize_size[0], int) and isinstance(resize_size[1], int) \
                and resize_size[0] > 0 and resize_size[1] > 0:
            self.resize_h = resize_size[0]
            self.resize_w = resize_size[1]
        else:
            raise (RuntimeError("crop size error.\n"))
        if padding is None:
            self.padding = padding
        elif isinstance(padding, list):
            if all(isinstance(i, numbers.Number) for i in padding):
                self.padding = padding
            else:
                raise (RuntimeError("padding in ResizeKeepRatio() should be a list of numbers\n"))
            if len(padding) != 3:
                raise (RuntimeError("padding channel is not equal to 3\n"))
        else:
            raise (RuntimeError("padding in ResizeKeepRatio() should be a list of numbers\n"))
        if isinstance(ignore_label, int):
            self.ignore_label = ignore_label
        else:
            raise (RuntimeError("ignore_label should be an integer number\n"))
        # self.crop_size = kwargs['crop_size']
        self.canonical_focal = kwargs['focal_length']

    def main_data_transform(self, image, label, intrinsic, cam_model, resize_ratio, padding, to_scale_ratio):
        """
        Resize the data first and then pad. 'label' will be scaled.
        """
        h, w, _ = image.shape
        reshape_h = int(resize_ratio * h)
        reshape_w = int(resize_ratio * w)

        pad_h, pad_w, pad_h_half, pad_w_half = padding

        # resize
        image = cv2.resize(image, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_LINEAR)
        # padding
        image = cv2.copyMakeBorder(
            image,
            pad_h_half,
            pad_h - pad_h_half,
            pad_w_half,
            pad_w - pad_w_half,
            cv2.BORDER_CONSTANT,
            value=self.padding)

        if label is not None:
            # label = cv2.resize(label, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_NEAREST)
            label = resize_depth_preserve(label, (reshape_h, reshape_w))
            label = cv2.copyMakeBorder(
                label,
                pad_h_half,
                pad_h - pad_h_half,
                pad_w_half,
                pad_w - pad_w_half,
                cv2.BORDER_CONSTANT,
                value=self.ignore_label)
            # scale the label
            label = label / to_scale_ratio

        # resize: adjust the focal length and principal point
        if intrinsic is not None:
            intrinsic[0] = intrinsic[0] * resize_ratio / to_scale_ratio
            intrinsic[1] = intrinsic[1] * resize_ratio / to_scale_ratio
            intrinsic[2] = intrinsic[2] * resize_ratio
            intrinsic[3] = intrinsic[3] * resize_ratio

        if cam_model is not None:
            # cam_model = cv2.resize(cam_model, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_LINEAR)
            cam_model = build_camera_model(reshape_h, reshape_w, intrinsic)
            cam_model = cv2.copyMakeBorder(
                cam_model,
                pad_h_half,
                pad_h - pad_h_half,
                pad_w_half,
                pad_w - pad_w_half,
                cv2.BORDER_CONSTANT,
                value=self.ignore_label)

        # pad: adjust the principal point
        if intrinsic is not None:
            intrinsic[2] = intrinsic[2] + pad_w_half
            intrinsic[3] = intrinsic[3] + pad_h_half
        return image, label, intrinsic, cam_model

    def get_label_scale_factor(self, image, intrinsic, resize_ratio):
        ori_h, ori_w, _ = image.shape
        # crop_h, crop_w = self.crop_size
        ori_focal = intrinsic[0]

        to_canonical_ratio = self.canonical_focal / ori_focal
        to_scale_ratio = resize_ratio / to_canonical_ratio
        return to_scale_ratio

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        target_h, target_w, _ = images[0].shape
        resize_ratio_h = self.resize_h / target_h
        resize_ratio_w = self.resize_w / target_w
        resize_ratio = min(resize_ratio_h, resize_ratio_w)
        reshape_h = int(resize_ratio * target_h)
        reshape_w = int(resize_ratio * target_w)
        pad_h = max(self.resize_h - reshape_h, 0)
        pad_w = max(self.resize_w - reshape_w, 0)
        pad_h_half = int(pad_h / 2)
        pad_w_half = int(pad_w / 2)

        pad_info = [pad_h, pad_w, pad_h_half, pad_w_half]
        to_scale_ratio = self.get_label_scale_factor(images[0], intrinsics[0], resize_ratio)

        for i in range(len(images)):
            img = images[i]
            label = labels[i] if i < len(labels) else None
            intrinsic = intrinsics[i] if i < len(intrinsics) else None
            cam_model = cam_models[i] if cam_models is not None and i < len(cam_models) else None
            img, label, intrinsic, cam_model = self.main_data_transform(
                img, label, intrinsic, cam_model, resize_ratio, pad_info, to_scale_ratio)
            images[i] = img
            if label is not None:
                labels[i] = label
            if intrinsic is not None:
                intrinsics[i] = intrinsic
            if cam_model is not None:
                cam_models[i] = cam_model

        if other_labels is not None:
            for i, other_lab in enumerate(other_labels):
                # resize
                other_lab = cv2.resize(other_lab, dsize=(reshape_w, reshape_h), interpolation=cv2.INTER_NEAREST)
                # pad
                other_labels[i] = cv2.copyMakeBorder(
                    other_lab,
                    pad_h_half,
                    pad_h - pad_h_half,
                    pad_w_half,
                    pad_w - pad_w_half,
                    cv2.BORDER_CONSTANT,
                    value=self.ignore_label)

        pad = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
        if transform_paras is not None:
            pad_old = transform_paras['pad'] if 'pad' in transform_paras else [0, 0, 0, 0]
            new_pad = [pad_old[0] + pad[0], pad_old[1] + pad[1], pad_old[2] + pad[2], pad_old[3] + pad[3]]
            transform_paras.update(dict(pad=new_pad))
            if 'label_scale_factor' in transform_paras:
                transform_paras['label_scale_factor'] = transform_paras['label_scale_factor'] * 1.0 / to_scale_ratio
            else:
                transform_paras.update(label_scale_factor=1.0 / to_scale_ratio)
        return images, labels, intrinsics, cam_models, other_labels, transform_paras
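
# Worked example (illustrative numbers): resizing a 480 x 640 image onto a 544 x 1216 canvas.
# resize_ratio = min(544 / 480, 1216 / 640) = min(1.1333, 1.9) = 1.1333, so the image becomes
# 544 x 725; pad_h = 0 and pad_w = 1216 - 725 = 491, split as 245 px on the left and 246 px on
# the right. The principal point is shifted by (pad_w_half, pad_h_half), and the depth labels
# are divided by to_scale_ratio so the scene stays consistent with keeping the focal length at
# its canonical value after the resize.
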
class BGR2RGB(object):
    # Converts an image from BGR order to RGB order, for models initialized from PyTorch pretrained weights (which expect RGB input)
    def __init__(self, **kwargs):
        return

    def __call__(self, images, labels, intrinsics, cam_models=None, other_labels=None, transform_paras=None):
        for i, img in enumerate(images):
            images[i] = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return images, labels, intrinsics, cam_models, other_labels, transform_paras

def resize_depth_preserve(depth, shape):
    """
    Resizes a depth map while preserving all valid depth pixels.
    Multiple downsampled points can be assigned to the same pixel.
    Parameters
    ----------
    depth : np.array [h, w]
        Depth map
    shape : tuple (H, W)
        Output shape
    Returns
    -------
    depth : np.array [H, W]
        Resized depth map
    """
    # Store dimensions and reshape to a single column
    depth = np.squeeze(depth)
    h, w = depth.shape
    x = depth.reshape(-1)
    # Create coordinate grid
    uv = np.mgrid[:h, :w].transpose(1, 2, 0).reshape(-1, 2)
    # Filter valid points
    idx = x > 0
    crd, val = uv[idx], x[idx]
    # Downsample coordinates
    crd[:, 0] = (crd[:, 0] * (shape[0] / h) + 0.5).astype(np.int32)
    crd[:, 1] = (crd[:, 1] * (shape[1] / w) + 0.5).astype(np.int32)
    # Filter points inside the image
    idx = (crd[:, 0] < shape[0]) & (crd[:, 1] < shape[1])
    crd, val = crd[idx], val[idx]
    # Create the downsampled depth image and assign the points
    depth = np.zeros(shape)
    depth[crd[:, 0], crd[:, 1]] = val
    # Return resized depth map
    return depth
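
# Note: unlike cv2.resize with INTER_NEAREST/INTER_LINEAR, which can drop sparse measurements
# or blend valid depths with the invalid (zero) background, this scatter-based resize keeps
# every valid source pixel; when several source pixels land on the same target pixel, the last
# one written wins.
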
def build_camera_model(H: int, W: int, intrinsics: list) -> np.array:
    """
    Encode the camera intrinsic parameters (focal length and principal point) into a 4-channel map.
    """
    fx, fy, u0, v0 = intrinsics
    f = (fx + fy) / 2.0
    # principal point location, normalized by the image size
    x_row = np.arange(0, W).astype(np.float32)
    x_row_center_norm = (x_row - u0) / W
    x_center = np.tile(x_row_center_norm, (H, 1))  # [H, W]

    y_col = np.arange(0, H).astype(np.float32)
    y_col_center_norm = (y_col - v0) / H
    y_center = np.tile(y_col_center_norm, (W, 1)).T  # [H, W]

    # FoV
    fov_x = np.arctan(x_center / (f / W))
    fov_y = np.arctan(y_center / (f / H))

    cam_model = np.stack([x_center, y_center, fov_x, fov_y], axis=2)
    return cam_model
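
# Example values (illustrative intrinsics, not from any dataset): for a 480 x 640 image with
# fx = fy = 500 and principal point (320, 240), the x_center channel runs roughly from -0.5 at
# the left border to +0.5 at the right border, and fov_x = arctan(x_center * W / f) spans about
# +/-0.57 rad. This gives the network a dense, resolution-independent encoding of where each
# pixel sits relative to the optical axis.
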
def gray_to_colormap(img, cmap='rainbow'):
    """
    Convert a single-channel map to a matplotlib colormap image.
    """
    assert img.ndim == 2

    img = img.copy()  # avoid modifying the caller's array in place
    img[img < 0] = 0
    mask_invalid = img < 1e-10
    # normalize by the 98th percentile instead of the maximum to suppress outliers
    img_ = img.flatten()
    max_value = np.percentile(img_, q=98)
    img = img / (max_value + 1e-8)

    norm = matplotlib.colors.Normalize(vmin=0, vmax=1.1)
    cmap_m = matplotlib.cm.get_cmap(cmap)
    mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap_m)
    colormap = (mapper.to_rgba(img)[:, :, :3] * 255).astype(np.uint8)
    colormap[mask_invalid] = 0
    return colormap
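
# A minimal end-to-end usage sketch (not part of the original pipeline). All parameter values
# below (canonical focal length, target size, ImageNet-style mean/std, intrinsics) are
# illustrative assumptions, not values taken from any particular training config.
if __name__ == '__main__':
    dummy_img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)    # fake BGR image
    dummy_depth = (np.random.rand(480, 640) * 10.0).astype(np.float32)  # fake metric depth
    intrinsic = [500.0, 500.0, 320.0, 240.0]                            # [fx, fy, u0, v0]
    cam_model = build_camera_model(480, 640, intrinsic)

    transform = Compose([
        BGR2RGB(),
        LableScaleCanonical(focal_length=1000.0),
        ResizeKeepRatio((544, 1216), padding=[123.675, 116.28, 103.53], focal_length=1000.0),
        ToTensor(),
        Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
    ])
    images, labels, intrinsics, cam_models, _, paras = transform(
        [dummy_img], [dummy_depth], [intrinsic], [cam_model], None, dict())
    # images[0]: (3, 544, 1216) float tensor; labels[0]: (1, 544, 1216) float tensor;
    # paras carries 'pad' and the accumulated 'label_scale_factor'.
    print(images[0].shape, labels[0].shape, paras)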