yaghi27 commited on
Commit
bbbed36
·
1 Parent(s): ce3dd67

Upload 3 files

Browse files
model/DETR3D/detr3d.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional
2
+
3
+ import torch
4
+ from torch import Tensor
5
+
6
+ from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
7
+ from mmdet3d.registry import MODELS
8
+ from mmdet3d.structures import Det3DDataSample
9
+ from mmdet3d.structures.bbox_3d.utils import get_lidar2img
10
+ from .grid_mask import GridMask
11
+
12
+
@MODELS.register_module()
class DETR3D(MVXTwoStageDetector):
    """DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries

    Args:
        data_preprocessor (dict or ConfigDict, optional): The pre-process
            config of :class:`Det3DDataPreprocessor`. Defaults to None.
        use_grid_mask (bool) : Data augmentation. Whether to mask out some
            grids during extract_img_feat. Defaults to False.
        img_backbone (dict, optional): Backbone of extracting
            images feature. Defaults to None.
        img_neck (dict, optional): Neck of extracting
            image features. Defaults to None.
        pts_bbox_head (dict, optional): Bboxes head of
            detr3d. Defaults to None.
        train_cfg (dict, optional): Train config of model.
            Defaults to None.
        test_cfg (dict, optional): Test config of model.
            Defaults to None.
        pretrained (optional): Accepted for signature compatibility only;
            this class never reads it. Defaults to None.
    """

    def __init__(self,
                 data_preprocessor=None,
                 use_grid_mask=False,
                 img_backbone=None,
                 img_neck=None,
                 pts_bbox_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super().__init__(
            img_backbone=img_backbone,
            img_neck=img_neck,
            pts_bbox_head=pts_bbox_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            data_preprocessor=data_preprocessor)
        # The mask module is always built; ``use_grid_mask`` only decides
        # whether ``extract_img_feat`` applies it.
        self.grid_mask = GridMask(
            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
        self.use_grid_mask = use_grid_mask

    def extract_img_feat(
            self, img: Tensor,
            batch_input_metas: List[dict]) -> Optional[List[Tensor]]:
        """Extract features from images.

        Args:
            img (tensor): Batched multi-view image tensor with
                shape (B, N, C, H, W).
            batch_input_metas (list[dict]): Meta information of multiple inputs
                in a batch. Each dict gets an ``input_shape`` entry written
                into it.

        Returns:
            list[tensor] | None: Multi-level image features, each reshaped
            to (B, N, C, H, W), or None when ``img`` is None.
        """
        # Guard first: reading ``img.size(0)`` before this check made the
        # "no image" path raise AttributeError instead of returning None.
        if img is None:
            return None

        # Remember the batch size before the view axis is folded into it.
        B = img.size(0)

        input_shape = img.shape[-2:]  # (H, W) of the network input
        # update real input shape of each single img
        for img_meta in batch_input_metas:
            img_meta.update(input_shape=input_shape)

        if img.dim() == 5:
            # Merge (B, N) -> B * N. Unlike an in-place ``squeeze_()`` this
            # leaves the caller's tensor untouched and never drops a
            # size-1 view or channel axis.
            img = img.flatten(0, 1)
        if self.use_grid_mask:
            img = self.grid_mask(img)  # mask out some grids
        img_feats = self.img_backbone(img)
        if isinstance(img_feats, dict):
            img_feats = list(img_feats.values())
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)

        # Split the merged axis back into (B, N) for every feature level.
        img_feats_reshaped = []
        for img_feat in img_feats:
            BN, C, H, W = img_feat.size()
            img_feats_reshaped.append(img_feat.view(B, BN // B, C, H, W))
        return img_feats_reshaped

    def extract_feat(self, batch_inputs_dict: Dict,
                     batch_input_metas: List[dict]) -> List[Tensor]:
        """Extract features from images.

        Looks up the ``'imgs'`` entry of ``batch_inputs_dict`` (None when
        absent) and forwards it to :meth:`extract_img_feat`.
        """
        imgs = batch_inputs_dict.get('imgs', None)
        img_feats = self.extract_img_feat(imgs, batch_input_metas)
        return img_feats

    def _forward(self, *args, **kwargs):
        """Tensor mode is not implemented.

        Any arguments are accepted so that a caller passing inputs gets the
        explicit NotImplementedError below rather than a TypeError about
        the argument count.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('tensor mode is yet to add')

    # original forward_train
    def loss(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
             batch_data_samples: List[Det3DDataSample],
             **kwargs) -> Dict[str, Tensor]:
        """Compute the training losses.

        Args:
            batch_inputs_dict (dict): The model input dict which include
                `imgs` keys.
                - imgs (torch.Tensor): Tensor of batched multi-view images.
                    It has shape (B, N, C, H ,W)
            batch_data_samples (List[obj:`Det3DDataSample`]): The Data Samples
                It usually includes information such as `gt_instance_3d`.
            **kwargs: Forwarded unchanged to the bbox head's forward call.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.

        """
        batch_input_metas = [item.metainfo for item in batch_data_samples]
        batch_input_metas = self.add_lidar2img(batch_input_metas)
        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
        outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)

        batch_gt_instances_3d = [
            item.gt_instances_3d for item in batch_data_samples
        ]
        losses_pts = self.pts_bbox_head.loss_by_feat(batch_gt_instances_3d,
                                                     outs)

        return losses_pts

    # original simple_test
    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
                batch_data_samples: List[Det3DDataSample],
                **kwargs) -> List[Det3DDataSample]:
        """Forward of testing.

        Args:
            batch_inputs_dict (dict): The model input dict which include
                `imgs` keys.

                - imgs (torch.Tensor): Tensor of batched multi-view images.
                    It has shape (B, N, C, H ,W)
            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance_3d`.
            **kwargs: Forwarded unchanged to the head's ``predict_by_feat``.

        Returns:
            list[:obj:`Det3DDataSample`]: Detection results of the
            input sample. Each Det3DDataSample usually contain
            'pred_instances_3d'. And the ``pred_instances_3d`` usually
            contains following keys.

            - scores_3d (Tensor): Classification scores, has a shape
                (num_instances, )
            - labels_3d (Tensor): Labels of bboxes, has a shape
                (num_instances, ).
            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
                contains a tensor with shape (num_instances, 9).
        """
        batch_input_metas = [item.metainfo for item in batch_data_samples]
        batch_input_metas = self.add_lidar2img(batch_input_metas)
        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
        outs = self.pts_bbox_head(img_feats, batch_input_metas)

        results_list_3d = self.pts_bbox_head.predict_by_feat(
            outs, batch_input_metas, **kwargs)

        # change the bboxes' format
        detsamples = self.add_pred_to_datasample(batch_data_samples,
                                                 results_list_3d)
        return detsamples

    # may need speed-up
    def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]:
        """add 'lidar2img' transformation matrix into batch_input_metas.

        Args:
            batch_input_metas (list[dict]): Meta information of multiple inputs
                in a batch. Each dict must provide ``cam2img`` and
                ``lidar2cam`` sequences indexed per camera; it is modified
                in place.

        Returns:
            batch_input_metas (list[dict]): Meta info with lidar2img added
        """
        for meta in batch_input_metas:
            l2i = []
            for i, cam2img in enumerate(meta['cam2img']):
                # Compose in double precision, store as float32 numpy.
                c2i = torch.tensor(cam2img).double()
                l2c = torch.tensor(meta['lidar2cam'][i]).double()
                l2i.append(get_lidar2img(c2i, l2c).float().numpy())
            meta['lidar2img'] = l2i
        return batch_input_metas
model/DETR3D/detr3d_head.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Dict, List, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from mmcv.cnn import Linear
7
+ from mmdet.models.dense_heads import DETRHead
8
+ from mmdet.models.layers import inverse_sigmoid
9
+ from mmdet.models.utils import multi_apply
10
+ from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
11
+ from mmengine.model import bias_init_with_prob
12
+ from mmengine.structures import InstanceData
13
+ from torch import Tensor
14
+
15
+ from mmdet3d.registry import MODELS, TASK_UTILS
16
+ from .util import normalize_bbox
17
+
18
+
@MODELS.register_module()
class DETR3DHead(DETRHead):
    """Head of DETR3D.

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
        bbox_coder (obj:`ConfigDict`): Configs to build the bbox coder
        num_cls_fcs (int) : the number of layers in cls and reg branch
        code_weights (Sequence[float]) : loss weights of
            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y)
        code_size (int) : size of code_weights
        num_query (int): Number of rows of the query embedding table.
            Defaults to 900.
        in_channels (int): Stored on the instance; not otherwise read by
            this class. Defaults to 256.
        positional_encoding (dict, optional): Stored on the instance; not
            otherwise read by this class. Defaults to None.
    """

    def __init__(
            self,
            *args,
            with_box_refine=False,
            as_two_stage=False,
            transformer=None,
            bbox_coder=None,
            num_cls_fcs=2,
            code_weights=(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2),
            code_size=10,
            num_query=900,
            in_channels=256,
            positional_encoding=None,
            **kwargs):

        # ``num_query``, ``in_channels`` and ``positional_encoding`` are
        # named parameters, so they are consumed here and can never reach
        # the parent constructor through ``**kwargs``.
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        self.code_weights = code_weights
        self.code_size = code_size
        self.num_query = num_query
        self.in_channels = in_channels
        self.positional_encoding = positional_encoding

        # The parent constructor invokes ``self._init_layers()``, but the
        # real layers need ``self.transformer``, which is only built after
        # the parent constructor has run. Shadow the hook with a no-op on
        # the instance for the duration of that call.
        def dummy_init_layers():
            pass

        self._init_layers = dummy_init_layers

        super().__init__(*args, **kwargs)

        # Build transformer now
        if self.as_two_stage:
            # Work on a copy so the caller's config is not modified.
            transformer = copy.deepcopy(transformer)
            transformer['as_two_stage'] = True
        self.transformer = MODELS.build(transformer)

        # Set bbox coder and sampler
        self.bbox_coder = TASK_UTILS.build(bbox_coder)
        self.pc_range = self.bbox_coder.pc_range
        self.num_cls_fcs = num_cls_fcs - 1
        sampler_cfg = dict(type='PseudoSampler')
        self.sampler = TASK_UTILS.build(sampler_cfg)

        # Swap the real layer builder in and run it.
        self._init_layers = self._real_init_layers
        self._init_layers()

        # Frozen parameter (not a plain tensor) so it is part of the
        # module's parameters and follows the module across devices.
        self.code_weights = nn.Parameter(
            torch.tensor(self.code_weights, requires_grad=False),
            requires_grad=False)

    def _real_init_layers(self):
        """Initialize classification branch and regression branch of head."""
        # NOTE(review): the classification branch depth is driven by
        # ``num_reg_fcs``; ``self.num_cls_fcs`` is stored but never read in
        # this class. Left unchanged so that layer names stay the same.
        cls_branch = []
        for _ in range(self.num_reg_fcs):
            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
            cls_branch.append(nn.LayerNorm(self.embed_dims))
            cls_branch.append(nn.ReLU(inplace=True))
        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
        fc_cls = nn.Sequential(*cls_branch)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, self.code_size))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            # Independent weights for every prediction layer.
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # The same module object is listed num_pred times, i.e. all
            # prediction layers share one set of weights.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DETR3D head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m[-1].bias, bias_init)

    def forward(self, mlvl_feats: List[Tensor], img_metas: List[Dict],
                **kwargs) -> Dict[str, Tensor]:
        """Forward function.

        Args:
            mlvl_feats (List[Tensor]): Features from the upstream
                network, each is a 5D-tensor with shape
                (B, N, C, H, W).
            img_metas (List[Dict]): Meta information of each sample,
                passed through to the transformer.
            **kwargs: Forwarded unchanged to the transformer.

        Returns:
            dict with the following entries.

            - all_cls_scores (Tensor): Outputs from the classification head,
              shape [nb_dec, bs, num_query, cls_out_channels]. Note
              cls_out_channels should includes background.
            - all_bbox_preds (Tensor): Outputs from the regression head in
              the format (cx, cy, l, w, cz, h, sin(φ), cos(φ), vx, vy),
              where cx, cy and cz have been mapped into ``pc_range``.
              Shape [nb_dec, bs, num_query, 10].
            - enc_cls_scores, enc_bbox_preds: Always None here.
        """
        query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references = self.transformer(
            mlvl_feats,
            query_embeds,
            reg_branches=self.reg_branches if self.with_box_refine else None,
            img_metas=img_metas,
            **kwargs)
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            # Layer 0 refines the initial reference points; every later
            # layer refines the previous layer's output.
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])  # shape: ([B, num_q, 10])
            # TODO: check the shape of reference
            assert reference.shape[-1] == 3
            # Add the predicted offsets to the reference in logit space,
            # then squash back to [0, 1]. Channels 0:2 are (cx, cy) and
            # channel 4 is cz.
            tmp[..., 0:2] += reference[..., 0:2]
            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
            tmp[..., 4:5] += reference[..., 2:3]
            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()

            # Scale the normalized centre from [0, 1] into pc_range.
            tmp[..., 0:1] = \
                tmp[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) \
                + self.pc_range[0]
            tmp[..., 1:2] = \
                tmp[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) \
                + self.pc_range[1]
            tmp[..., 4:5] = \
                tmp[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) \
                + self.pc_range[2]

            # TODO: check if using sigmoid
            outputs_coord = tmp
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        outs = {
            'all_cls_scores': outputs_classes,
            'all_bbox_preds': outputs_coords,
            'enc_cls_scores': None,
            'enc_bbox_preds': None,
        }
        return outs

    def _get_target_single(
            self,
            cls_score: Tensor,  # [query, num_cls]
            bbox_pred: Tensor,  # [query, 10]
            gt_instances_3d: InstanceData) -> Tuple[Tensor, ...]:
        """Compute regression and classification targets for a single image.

        Args:
            cls_score (Tensor): Classification logits of one sample.
            bbox_pred (Tensor): Box predictions of one sample.
            gt_instances_3d (:obj:`InstanceData`): Ground truth of the
                sample; ``bboxes_3d`` and ``labels_3d`` are read.

        Returns:
            tuple[Tensor, ...]: ``(labels, label_weights, bbox_targets,
            bbox_weights, pos_inds, neg_inds)``.
        """
        # turn bottom center into gravity center
        gt_bboxes = gt_instances_3d.bboxes_3d  # [num_gt, 9]
        gt_bboxes = torch.cat(
            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1)

        # presumably [num_gt] class indices - TODO confirm
        gt_labels = gt_instances_3d.labels_3d
        # assigner and sampler: PseudoSampler
        assign_result = self.assigner.assign(
            bbox_pred, cls_score, gt_bboxes, gt_labels, gt_bboxes_ignore=None)
        sampling_result = self.sampler.sample(
            assign_result, InstanceData(priors=bbox_pred),
            InstanceData(bboxes_3d=gt_bboxes))
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets: every query starts with the label ``num_classes``;
        # matched queries are overwritten with their ground-truth label.
        num_bboxes = bbox_pred.size(0)
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        # theta in gt_bbox here is still a single scalar, hence one channel
        # fewer than the prediction.
        bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size - 1]
        bbox_weights = torch.zeros_like(bbox_pred)
        # only matched query will learn from bbox coord
        bbox_weights[pos_inds] = 1.0

        # fix empty gt bug in multi gpu training
        if sampling_result.pos_gt_bboxes.shape[0] == 0:
            sampling_result.pos_gt_bboxes = \
                sampling_result.pos_gt_bboxes.reshape(0, self.code_size - 1)

        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    def get_targets(
            self,
            batch_cls_scores: List[Tensor],  # bs[num_q,num_cls]
            batch_bbox_preds: List[Tensor],  # bs[num_q,10]
            batch_gt_instances_3d: InstanceList) -> Tuple:
        """Compute regression and classification targets for a batch image for
        a single decoder layer.

        Args:
            batch_cls_scores (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
            batch_bbox_preds (list[Tensor]): Box predictions from a single
                decoder layer for each image, in the format
                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y) and
                shape [num_query, 10]
            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes_3d``, ``labels_3d``.
        Returns:
            tuple: a tuple containing the following targets.
                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all \
                    images.
                - bbox_targets_list (list[Tensor]): BBox targets for all \
                    images.
                - bbox_weights_list (list[Tensor]): BBox weights for all \
                    images.
                - num_total_pos (int): Number of positive samples in all \
                    images.
                - num_total_neg (int): Number of negative samples in all \
                    images.
        """
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         pos_inds_list, neg_inds_list) = multi_apply(self._get_target_single,
                                                     batch_cls_scores,
                                                     batch_bbox_preds,
                                                     batch_gt_instances_3d)

        num_total_pos = sum(inds.numel() for inds in pos_inds_list)
        num_total_neg = sum(inds.numel() for inds in neg_inds_list)
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def loss_by_feat_single(
        self,
        batch_cls_scores: Tensor,  # bs,num_q,num_cls
        batch_bbox_preds: Tensor,  # bs,num_q,10
        batch_gt_instances_3d: InstanceList
    ) -> Tuple[Tensor, Tensor]:
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            batch_cls_scores (Tensor): Box score logits from a single
                decoder layer for batched images with shape [bs, num_query,
                cls_out_channels].
            batch_bbox_preds (Tensor): Box predictions from a single
                decoder layer for batched images, in the format
                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y) and
                shape [bs, num_query, 10]
            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
                gt_instance_3d. It usually has ``bboxes_3d``, ``labels_3d``.
        Returns:
            tuple(Tensor, Tensor): cls and reg loss for outputs from
            a single decoder layer.
        """
        batch_size = batch_cls_scores.size(0)  # batch size
        cls_scores_list = [batch_cls_scores[i] for i in range(batch_size)]
        bbox_preds_list = [batch_bbox_preds[i] for i in range(batch_size)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           batch_gt_instances_3d)

        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        batch_cls_scores = batch_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                batch_cls_scores.new_tensor([cls_avg_factor]))

        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            batch_cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # regression L1 loss
        batch_bbox_preds = batch_bbox_preds.reshape(-1,
                                                    batch_bbox_preds.size(-1))
        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
        # neg_query is all 0, log(0) is NaN; drop every row that is not
        # fully finite before computing the loss.
        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
        bbox_weights = bbox_weights * self.code_weights

        loss_bbox = self.loss_bbox(
            batch_bbox_preds[isnotnan, :self.code_size],
            normalized_bbox_targets[isnotnan, :self.code_size],
            bbox_weights[isnotnan, :self.code_size],
            avg_factor=num_total_pos)

        loss_cls = torch.nan_to_num(loss_cls)
        loss_bbox = torch.nan_to_num(loss_bbox)
        return loss_cls, loss_bbox

    # original loss()
    def loss_by_feat(
            self,
            batch_gt_instances_3d: InstanceList,
            preds_dicts: Dict[str, Tensor],
            batch_gt_instances_3d_ignore: OptInstanceList = None) -> Dict:
        """Compute loss of the head.

        Args:
            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
                gt_instance_3d. It usually includes ``bboxes_3d``,
                ``labels_3d``, ``depths``, ``centers_2d`` and attributes.
            preds_dicts (dict): Output of :meth:`forward`; the keys
                ``all_cls_scores``, ``all_bbox_preds``, ``enc_cls_scores``
                and ``enc_bbox_preds`` are read.
            batch_gt_instances_3d_ignore (list[:obj:`InstanceData`], Optional):
                NOT supported.
                Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert batch_gt_instances_3d_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for batch_gt_instances_3d_ignore setting to None.'
        all_cls_scores = preds_dicts[
            'all_cls_scores']  # num_dec,bs,num_q,num_cls
        all_bbox_preds = preds_dicts['all_bbox_preds']  # num_dec,bs,num_q,10
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

        # calculate loss for each decoder layer; every layer is matched
        # against the same ground truth.
        num_dec_layers = len(all_cls_scores)
        batch_gt_instances_3d_list = [
            batch_gt_instances_3d for _ in range(num_dec_layers)
        ]
        losses_cls, losses_bbox = multi_apply(self.loss_by_feat_single,
                                              all_cls_scores, all_bbox_preds,
                                              batch_gt_instances_3d_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # A single prediction set takes the per-sample ground-truth
            # list, not the per-decoder-layer list of lists built above.
            enc_loss_cls, enc_losses_bbox = self.loss_by_feat_single(
                enc_cls_scores, enc_bbox_preds, batch_gt_instances_3d)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]

        # loss from other decoder layers
        for num_dec_layer, (loss_cls_i, loss_bbox_i) in enumerate(
                zip(losses_cls[:-1], losses_bbox[:-1])):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
        return loss_dict

    def predict_by_feat(self,
                        preds_dicts,
                        img_metas,
                        rescale=False) -> InstanceList:
        """Transform network output for a batch into bbox predictions.

        Args:
            preds_dicts (Dict[str, Tensor]):
                -all_cls_scores (Tensor): Outputs from the classification head,
                    shape [nb_dec, bs, num_query, cls_out_channels]. Note
                    cls_out_channels should includes background.
                -all_bbox_preds (Tensor): Outputs from the regression
                    head in the format
                    (cx, cy, l, w, cz, h, rot_sine, rot_cosine, v_x, v_y).
                    Shape [nb_dec, bs, num_query, 10].
            img_metas (list[dict]): Meta information of each sample; only
                ``box_type_3d`` is read here.
            rescale (bool): Accepted for interface compatibility; not used
                by this method. Defaults to False.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores_3d (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels_3d (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes_3d (Tensor): Contains a tensor with shape
              (num_instances, C), where C >= 7.
        """
        # sinθ & cosθ ---> θ
        preds_dicts = self.bbox_coder.decode(preds_dicts)
        num_samples = len(preds_dicts)  # batch size
        ret_list = []
        for i in range(num_samples):
            results = InstanceData()
            preds = preds_dicts[i]
            bboxes = preds['bboxes']
            # Shift z down by half of column 5 before wrapping in the box
            # type (presumably gravity centre -> bottom centre, the inverse
            # of the conversion in ``_get_target_single``).
            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
            bboxes = img_metas[i]['box_type_3d'](bboxes, self.code_size - 1)

            results.bboxes_3d = bboxes
            results.scores_3d = preds['scores']
            results.labels_3d = preds['labels']
            ret_list.append(results)
        return ret_list
model/DETR3D/detr3d_r101_gridmask.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_scope = 'mmdet3d'
2
+
3
+ default_hooks = dict(
4
+ timer=dict(type='IterTimerHook'),
5
+ logger=dict(type='LoggerHook', interval=50),
6
+ param_scheduler=dict(type='ParamSchedulerHook'),
7
+ checkpoint=dict(type='CheckpointHook', interval=-1),
8
+ sampler_seed=dict(type='DistSamplerSeedHook'),
9
+ visualization=dict(type='Det3DVisualizationHook'))
10
+
11
+ env_cfg = dict(
12
+ cudnn_benchmark=False,
13
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
14
+ dist_cfg=dict(backend='nccl'),
15
+ )
16
+
17
+ log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
18
+
19
+ log_level = 'INFO'
20
+ load_from = None
21
+ resume = False
22
+
23
+
24
+
25
+ custom_imports = dict(imports=['projects.DETR3D.detr3d'])
26
+
27
+
28
+ # If point cloud range is changed, the models should also change their point
29
+ # cloud range accordingly
30
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
31
+ voxel_size = [0.2, 0.2, 8]
32
+
33
+ img_norm_cfg = dict(
34
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], bgr_to_rgb=False)
35
+ # For nuScenes we usually do 10-class detection
36
+ class_names = [
37
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
38
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
39
+ ]
40
+
41
+ input_modality = dict(
42
+ use_lidar=False,
43
+ use_camera=True,
44
+ use_radar=False,
45
+ use_map=False,
46
+ use_external=False)
47
+ # this means type='DETR3D' will be processed as 'mmdet3d.DETR3D'
48
+ default_scope = 'mmdet3d'
49
+ model = dict(
50
+ type='DETR3D',
51
+ use_grid_mask=True,
52
+ data_preprocessor=dict(
53
+ type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32),
54
+ img_backbone=dict(
55
+ type='mmdet.RegNet',
56
+ arch='regnetx_4.0gf',
57
+ out_indices=(0,1,2,3),
58
+ init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')
59
+ ),
60
+ img_neck=dict(
61
+ type='mmdet.FPN',
62
+ in_channels=[80, 240, 560, 1360],
63
+ out_channels=256,
64
+ start_level=1,
65
+ add_extra_convs='on_output',
66
+ num_outs=4,
67
+ relu_before_extra_convs=True),
68
+ pts_bbox_head=dict(
69
+ type='DETR3DHead',
70
+ num_query=900,
71
+ num_classes=10,
72
+ in_channels=256,
73
+ sync_cls_avg_factor=True,
74
+ with_box_refine=True,
75
+ as_two_stage=False,
76
+ transformer=dict(
77
+ type='Detr3DTransformer',
78
+ decoder=dict(
79
+ type='Detr3DTransformerDecoder',
80
+ num_layers=6,
81
+ return_intermediate=True,
82
+ transformerlayers=dict(
83
+ type='BaseTransformerLayer',
84
+ attn_cfgs=[
85
+ dict(
86
+ type='MultiheadAttention', # mmcv.
87
+ embed_dims=256,
88
+ num_heads=8,
89
+ dropout=0.1),
90
+ dict(
91
+ type='Detr3DCrossAtten',
92
+ pc_range=point_cloud_range,
93
+ num_points=4,
94
+ embed_dims=256)
95
+ ],
96
+ feedforward_channels=512,
97
+ ffn_dropout=0.1,
98
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
99
+ 'ffn', 'norm')))),
100
+ bbox_coder=dict(
101
+ type='NMSFreeCoder',
102
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
103
+ pc_range=point_cloud_range,
104
+ max_num=300,
105
+ voxel_size=voxel_size,
106
+ num_classes=10),
107
+ positional_encoding=dict(
108
+ type='mmdet.SinePositionalEncoding',
109
+ num_feats=128,
110
+ normalize=True,
111
+ offset=-0.5),
112
+ loss_cls=dict(
113
+ type='mmdet.FocalLoss',
114
+ use_sigmoid=True,
115
+ gamma=2.0,
116
+ alpha=0.25,
117
+ loss_weight=2.0),
118
+ loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
119
+ loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.5)),
120
+ # model training and testing settings
121
+ train_cfg=dict(
122
+ pts=dict(
123
+ grid_size=[512, 512, 1],
124
+ voxel_size=voxel_size,
125
+ point_cloud_range=point_cloud_range,
126
+ out_size_factor=2,
127
+ assigner=dict(
128
+ type='HungarianAssigner3D',
129
+ cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0),
130
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.5),
131
+ # ↓ Fake cost. This is just to get compatible with DETR head
132
+ iou_cost=dict(type='mmdet.IoUCost', weight=0.0),
133
+ pc_range=point_cloud_range))))
134
+
135
+ dataset_type = 'NuScenesDataset'
136
+ data_root = 'data/nuscenes/'
137
+
138
+ test_transforms = [
139
+ dict(
140
+ type='RandomResize3D',
141
+ scale=(800, 450),
142
+ ratio_range=(1., 1.),
143
+ keep_ratio=True)
144
+
145
+ ]
146
+
147
+ # test_transforms = [
148
+ # dict(
149
+ # type='RandomResize3D',
150
+ # scale=(1400, 800),
151
+ # ratio_range=(0.8, 1.2),
152
+ # keep_ratio=True
153
+ # ),
154
+ # ]
155
+
156
+ train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms  # photometric distortion first, then the shared test-time transforms
157
+ # train_transforms = [
158
+ # dict(type='PhotoMetricDistortion3D'),
159
+ # dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
160
+ # dict(
161
+ # type='GlobalRotScaleTrans',
162
+ # rot_range=[-0.3925, 0.3925],
163
+ # scale_ratio_range=[0.9, 1.1],
164
+ # translation_std=[0, 0, 0]
165
+ # ),
166
+ # ] + test_transforms
167
+
168
+ backend_args = None  # shared by the image loaders, both datasets and the evaluator
168
+ train_pipeline = [
169
+ dict(
170
+ type='LoadMultiViewImageFromFiles',
171
+ to_float32=True,
172
+ num_views=6,  # same count as the six CAM_* entries in data_prefix
173
+ backend_args=backend_args),
174
+ dict(
175
+ type='LoadAnnotations3D',
176
+ with_bbox_3d=True,
177
+ with_label_3d=True,
178
+ with_attr_label=False),
179
+ dict(type='MultiViewWrapper', transforms=train_transforms),  # image transforms are supplied through the wrapper
180
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
181
+ dict(type='ObjectNameFilter', classes=class_names),
182
+ dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])  # images plus 3D boxes and labels are packed
183
+ ]
185
+
186
+ test_pipeline = [  # unlike train_pipeline: no annotation loading or GT filtering
186
+ dict(
187
+ type='LoadMultiViewImageFromFiles',
188
+ to_float32=True,
189
+ num_views=6,  # same count as the six CAM_* entries in data_prefix
190
+ backend_args=backend_args),
191
+ dict(type='MultiViewWrapper', transforms=test_transforms),
192
+ dict(type='Pack3DDetInputs', keys=['img'])  # only images are packed
193
+ ]
195
+
196
+ metainfo = dict(classes=class_names)  # passed as metainfo to both dataset dicts
196
+ data_prefix = dict(  # per-sensor sub-directories; presumably relative to data_root (confirm)
197
+ pts='',  # empty prefix for point clouds
198
+ CAM_FRONT='samples/CAM_FRONT',
199
+ CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
200
+ CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
201
+ CAM_BACK='samples/CAM_BACK',
202
+ CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
203
+ CAM_BACK_LEFT='samples/CAM_BACK_LEFT')
205
+
206
+ train_dataloader = dict(
206
+ batch_size=2,  # presumably per process/GPU; confirm against the launcher
207
+ num_workers=8,
208
+ persistent_workers=True,
209
+ drop_last=False,
210
+ sampler=dict(type='DefaultSampler', shuffle=True),  # training samples are shuffled
211
+ # sampler=dict(
212
+ # type='ClassBalancedDataset',
213
+ # dataset=dict(type='DefaultSampler', shuffle=True),
214
+ # oversample_thr=0.001),
215
+ dataset=dict(
216
+ type=dataset_type,
217
+ data_root=data_root,
218
+ ann_file='nuscenes_infos_train.pkl',
219
+ pipeline=train_pipeline,
220
+ load_type='frame_based',
221
+ metainfo=metainfo,
222
+ modality=input_modality,
223
+ test_mode=False,
224
+ data_prefix=data_prefix,
225
+ # box_type_3d='LiDAR' is used for the KITTI and nuScenes datasets,
226
+ # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
227
+ box_type_3d='LiDAR',
228
+ backend_args=backend_args))
230
+
231
+ val_dataloader = dict(
231
+ batch_size=2,
232
+ num_workers=8,
233
+ persistent_workers=True,
234
+ drop_last=False,
235
+ sampler=dict(type='DefaultSampler', shuffle=False),  # no shuffling for validation
236
+ dataset=dict(
237
+ type=dataset_type,
238
+ data_root=data_root,
239
+ ann_file='nuscenes_infos_val.pkl',
240
+ load_type='frame_based',
241
+ pipeline=test_pipeline,  # image-only pipeline, no ground-truth loading
242
+ metainfo=metainfo,
243
+ modality=input_modality,
244
+ test_mode=True,
245
+ data_prefix=data_prefix,
246
+ box_type_3d='LiDAR',
247
+ backend_args=backend_args))
248
+
249
+ test_dataloader = val_dataloader  # testing reuses the validation loader (same dict object, val split)
251
+
252
+ val_evaluator = dict(
252
+ type='NuScenesMetric',
253
+ data_root=data_root,
254
+ ann_file=data_root + 'nuscenes_infos_val.pkl',  # data_root is prepended explicitly here, unlike the dataset's ann_file
255
+ metric='bbox',
256
+ backend_args=backend_args)
257
+ test_evaluator = val_evaluator  # testing reuses the validation evaluator (same dict object)
259
+
260
+ optim_wrapper = dict(
260
+ type='OptimWrapper',
261
+ optimizer=dict(type='AdamW', lr=1e-4, weight_decay=0.01),  # base learning rate is 1e-4
262
+ paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}),  # lr multiplier of 0.1 for parameters matched by 'img_backbone'
263
+ clip_grad=dict(max_norm=35, norm_type=2),  # gradient clipping: L2 norm, threshold 35
264
+ )
266
+
267
+ # learning policy
268
+ param_scheduler = [  # NOTE(review): both entries begin at 0, so warm-up and cosine decay overlap; confirm intended
268
+ dict(
269
+ type='LinearLR',
270
+ start_factor=1.0 / 3,  # warm-up starts at one third of the configured lr
271
+ by_epoch=False,  # begin/end of this entry are counted in iterations
272
+ begin=0,
273
+ end=14000),
274
+ dict(
275
+ type='CosineAnnealingLR',
276
+ by_epoch=True,  # begin/end/T_max of this entry are counted in epochs
277
+ begin=0,
278
+ end=50,  # hard-coded; keep in sync with total_epochs (50)
279
+ T_max=50,  # hard-coded; keep in sync with total_epochs (50)
280
+ eta_min_ratio=1e-3)
281
+ ]
283
+
284
+ total_epochs = 50  # the CosineAnnealingLR entry in param_scheduler hard-codes the same value
285
+ train_cfg = dict(
286
+ type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=2)  # validation interval of 2 epochs
287
+ val_cfg = dict(type='ValLoop')
288
+ test_cfg = dict(type='TestLoop')
290
+ default_hooks = dict(
290
+ checkpoint=dict(
291
+ type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True))  # save interval 1, at most one checkpoint kept
293
+ # load_from = 'work_dirs/detr3d_nuscenes/epoch_30.pth'
294
+
295
+ # setuptools 65 downgrades to 58.
296
+ # In mmlab-node we use setuptools 61, and no errors occur.
297
+ vis_backends = [dict(type='TensorboardVisBackend')]  # TensorBoard is the only visualisation backend configured
297
+ visualizer = dict(
298
+ type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')