# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import Optional

import mmcv
import numpy as np
from mmengine.fileio import get

from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles
from mmdet3d.registry import TRANSFORMS


@TRANSFORMS.register_module()
class BEVLoadMultiViewImageFromFiles(LoadMultiViewImageFromFiles):
"""Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the
convenience of view transforms in the forward:
- 'cam2lidar'
- 'lidar2img'
Args:
to_float32 (bool): Whether to convert the img to float32.
Defaults to False.
color_type (str): Color type of the file. Defaults to 'unchanged'.
backend_args (dict, optional): Arguments to instantiate the
corresponding backend. Defaults to None.
num_views (int): Number of view in a frame. Defaults to 5.
num_ref_frames (int): Number of frame in loading. Defaults to -1.
test_mode (bool): Whether is test mode in loading. Defaults to False.
set_default_scale (bool): Whether to set default scale.
Defaults to True.
"""
def transform(self, results: dict) -> Optional[dict]:
"""Call function to load multi-view image from files.
Args:
results (dict): Result dict containing multi-view image filenames.
Returns:
dict: The result dict containing the multi-view image data.
Added keys and values are described below.
- filename (str): Multi-view image filenames.
- img (np.ndarray): Multi-view image arrays.
- img_shape (tuple[int]): Shape of multi-view image arrays.
- ori_shape (tuple[int]): Shape of original image arrays.
- pad_shape (tuple[int]): Shape of padded image arrays.
- scale_factor (float): Scale factor.
- img_norm_cfg (dict): Normalization configuration of images.
"""
        # TODO: consider splitting the multi-sweep part out of this pipeline
        # Derive the mask and transform for loading multi-sweep data
if self.num_ref_frames > 0:
# init choice with the current frame
init_choice = np.array([0], dtype=np.int64)
num_frames = len(results['img_filename']) // self.num_views - 1
            if num_frames == 0:  # no previous frame, so reuse the current one
choices = np.random.choice(
1, self.num_ref_frames, replace=True)
elif num_frames >= self.num_ref_frames:
# NOTE: suppose the info is saved following the order
# from latest to earlier frames
if self.test_mode:
choices = np.arange(num_frames - self.num_ref_frames,
num_frames) + 1
# NOTE: +1 is for selecting previous frames
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=False) + 1
elif num_frames > 0 and num_frames < self.num_ref_frames:
if self.test_mode:
base_choices = np.arange(num_frames) + 1
random_choices = np.random.choice(
num_frames,
self.num_ref_frames - num_frames,
replace=True) + 1
choices = np.concatenate([base_choices, random_choices])
else:
choices = np.random.choice(
num_frames, self.num_ref_frames, replace=True) + 1
else:
raise NotImplementedError
choices = np.concatenate([init_choice, choices])
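            # `choices` now holds the current frame (offset 0) followed by
            # `num_ref_frames` offsets (>= 1) of earlier frames, each offset
            # indexing one block of `num_views` entries in `img_filename`.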
select_filename = []
for choice in choices:
                select_filename += results['img_filename'][
                    choice * self.num_views:(choice + 1) * self.num_views]
results['img_filename'] = select_filename
for key in ['cam2img', 'lidar2cam']:
if key in results:
select_results = []
for choice in choices:
                        select_results += results[key][
                            choice * self.num_views:(choice + 1) *
                            self.num_views]
results[key] = select_results
for key in ['ego2global']:
if key in results:
select_results = []
for choice in choices:
select_results += [results[key][choice]]
results[key] = select_results
# Transform lidar2cam to
# [cur_lidar]2[prev_img] and [cur_lidar]2[prev_cam]
for key in ['lidar2cam']:
if key in results:
# only change matrices of previous frames
for choice_idx in range(1, len(choices)):
                        pad_prev_ego2global = np.eye(4)
                        prev_ego2global = results['ego2global'][choice_idx]
                        pad_prev_ego2global[:prev_ego2global.shape[0],
                                            :prev_ego2global.shape[1]] = \
                            prev_ego2global
                        pad_cur_ego2global = np.eye(4)
                        cur_ego2global = results['ego2global'][0]
                        pad_cur_ego2global[:cur_ego2global.shape[0],
                                           :cur_ego2global.shape[1]] = \
                            cur_ego2global
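                        # Chain the two poses through global coordinates:
                        # cur2prev maps points from the current ego frame to
                        # the previous ego frame, so the previous lidar2cam
                        # composed with cur2prev takes current-frame points
                        # into the previous camera, as noted above.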
cur2prev = np.linalg.inv(pad_prev_ego2global).dot(
pad_cur_ego2global)
for result_idx in range(choice_idx * self.num_views,
(choice_idx + 1) *
self.num_views):
results[key][result_idx] = \
results[key][result_idx].dot(cur2prev)
# Support multi-view images with different shapes
# TODO: record the origin shape and padded shape
filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], []
for _, cam_item in results['images'].items():
filename.append(cam_item['img_path'])
lidar2cam.append(cam_item['lidar2cam'])
lidar2cam_array = np.array(cam_item['lidar2cam']).astype(
np.float32)
lidar2cam_rot = lidar2cam_array[:3, :3]
lidar2cam_trans = lidar2cam_array[:3, 3:4]
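            # Invert the rigid lidar2cam transform in closed form:
            # cam2lidar = [[R^T, -R^T @ t], [0, 1]].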
camera2lidar = np.eye(4)
camera2lidar[:3, :3] = lidar2cam_rot.T
camera2lidar[:3, 3:4] = -1 * np.matmul(
lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1))
cam2lidar.append(camera2lidar)
cam2img_array = np.eye(4).astype(np.float32)
cam2img_array[:3, :3] = np.array(cam_item['cam2img']).astype(
np.float32)
cam2img.append(cam2img_array)
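            # Compose the homogeneous intrinsics with the extrinsics so that
            # lidar points can be projected directly to image pixels.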
lidar2img.append(cam2img_array @ lidar2cam_array)
results['img_path'] = filename
results['cam2img'] = np.stack(cam2img, axis=0)
results['lidar2cam'] = np.stack(lidar2cam, axis=0)
results['cam2lidar'] = np.stack(cam2lidar, axis=0)
results['lidar2img'] = np.stack(lidar2img, axis=0)
results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
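        # The stacked matrices should all have shape (num_cams, 4, 4).
        # `ori_cam2img` keeps an untouched copy of the intrinsics, presumably
        # so later resize/crop augmentations can update `cam2img` in place.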
# img is of shape (h, w, c, num_views)
# h and w can be different for different views
img_bytes = [
get(name, backend_args=self.backend_args) for name in filename
]
imgs = [
mmcv.imfrombytes(
img_byte,
flag=self.color_type,
backend='pillow',
channel_order='rgb') for img_byte in img_bytes
]
        # handle images with different shapes
img_shapes = np.stack([img.shape for img in imgs], axis=0)
img_shape_max = np.max(img_shapes, axis=0)
img_shape_min = np.min(img_shapes, axis=0)
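        # All views must share the same channel count; height and width may
        # differ between views and are unified by the padding below.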
assert img_shape_min[-1] == img_shape_max[-1]
if not np.all(img_shape_max == img_shape_min):
pad_shape = img_shape_max[:2]
else:
pad_shape = None
if pad_shape is not None:
imgs = [
mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
]
img = np.stack(imgs, axis=-1)
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = filename
# unravel to list, see `DefaultFormatBundle` in formating.py
# which will transpose each image separately and then stack into array
results['img'] = [img[..., i] for i in range(img.shape[-1])]
results['img_shape'] = img.shape[:2]
results['ori_shape'] = img.shape[:2]
# Set initial values for default meta_keys
results['pad_shape'] = img.shape[:2]
if self.set_default_scale:
results['scale_factor'] = 1.0
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results['img_norm_cfg'] = dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False)
results['num_views'] = self.num_views
results['num_ref_frames'] = self.num_ref_frames
return results
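

# A minimal smoke-test sketch (not part of the original file). All file
# names, camera names and matrices below are made-up placeholders; it only
# illustrates the per-camera dict layout the transform expects under
# ``results['images']``.
if __name__ == '__main__':
    import os.path as osp
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    img_paths = []
    for i in range(2):
        path = osp.join(tmp_dir, f'cam_{i}.jpg')
        # write a tiny random image so the loader has something to read
        mmcv.imwrite(
            np.random.randint(0, 255, (8, 8, 3), dtype=np.uint8), path)
        img_paths.append(path)

    dummy_results = {
        'images': {
            f'CAM_{i}': dict(
                img_path=img_paths[i],
                cam2img=np.eye(3).tolist(),
                lidar2cam=np.eye(4).tolist())
            for i in range(2)
        }
    }
    loader = BEVLoadMultiViewImageFromFiles(to_float32=True, num_views=2)
    out = loader.transform(dummy_results)
    print(out['lidar2img'].shape)  # expected: (2, 4, 4)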