_base_ = [
'../../../configs/_base_/datasets/nus-3d.py',
'../../../configs/_base_/default_runtime.py',
'../../../configs/_base_/schedules/cyclic-20e.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
custom_imports = dict(imports=['projects.PETR.petr'])
randomness = dict(seed=1, deterministic=False, diff_rank_seed=False)
# If the point cloud range is changed, the model's point cloud range should
# also be changed accordingly.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
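# Note: `position_range` (head) and `post_center_range` (bbox coder) below are
# set wider than `point_cloud_range` (+/-61.2 m vs +/-51.2 m on x/y), leaving a
# margin for predictions near the range boundary.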
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675],
std=[57.375, 57.120, 58.395],
to_rgb=False)
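# The pipelines below contain no normalization transform; image normalization
# is handled by `Det3DDataPreprocessor` (same mean/std), so `img_norm_cfg` here
# appears to be kept only for reference.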
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
metainfo = dict(classes=class_names)
input_modality = dict(use_camera=True)
model = dict(
type='PETR',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[103.530, 116.280, 123.675],
std=[57.375, 57.120, 58.395],
bgr_to_rgb=False,
pad_size_divisor=32),
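    # With `use_grid_mask=True`, PETR applies GridMask augmentation to the
    # multi-view images during training (no effect at test time).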
use_grid_mask=True,
img_backbone=dict(
type='VoVNetCP',
spec_name='V-99-eSE',
norm_eval=True,
frozen_stages=-1,
input_ch=3,
out_features=(
'stage4',
'stage5',
)),
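    # VoVNet-99 'stage4'/'stage5' features have 768/1024 channels, matching
    # the `in_channels` of the CPFPN neck below.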
img_neck=dict(
type='CPFPN', in_channels=[768, 1024], out_channels=256, num_outs=2),
pts_bbox_head=dict(
type='PETRHead',
num_classes=10,
in_channels=256,
num_query=900,
LID=True,
with_position=True,
with_multiview=True,
position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
normedlinear=False,
transformer=dict(
type='PETRTransformer',
decoder=dict(
type='PETRTransformerDecoder',
return_intermediate=True,
num_layers=6,
transformerlayers=dict(
type='PETRTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.1,
dropout_layer=dict(type='Dropout', drop_prob=0.1)),
dict(
type='PETRMultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.1,
dropout_layer=dict(type='Dropout', drop_prob=0.1)),
],
feedforward_channels=2048,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')),
)),
bbox_coder=dict(
type='NMSFreeCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
pc_range=point_cloud_range,
max_num=300,
voxel_size=voxel_size,
num_classes=10),
positional_encoding=dict(
type='SinePositionalEncoding3D', num_feats=128, normalize=True),
loss_cls=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
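        # Like `iou_cost` in the assigner below, `loss_iou` with weight 0.0 is
        # a placeholder kept only for compatibility with the DETR-style head.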
loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
# model training and testing settings
train_cfg=dict(
pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='HungarianAssigner3D',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
iou_cost=dict(
type='IoUCost', weight=0.0
), # Fake cost. Just to be compatible with DETR head.
pc_range=point_cloud_range))))
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
backend_args = None
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
backend_args=backend_args),
backend_args=backend_args)
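# Note: `db_sampler` appears to be carried over from the LiDAR-based nuScenes
# configs; it is not referenced by the camera-only pipelines below.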
ida_aug_conf = {
'resize_lim': (0.47, 0.625),
'final_dim': (320, 800),
'bot_pct_lim': (0.0, 0.0),
'rot_lim': (0.0, 0.0),
'H': 900,
'W': 1600,
'rand_flip': True,
}
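# With 1600x900 (W x H) source images, `resize_lim` (0.47, 0.625) scales images
# so the width falls roughly in 752-1000 px before cropping to `final_dim`
# (H=320, W=800); `rand_flip` enables random horizontal flips during training.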
train_pipeline = [
dict(
type='LoadMultiViewImageFromFiles',
to_float32=True,
backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(
type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True),
dict(
type='GlobalRotScaleTransImage',
rot_range=[-0.3925, 0.3925],
translation_std=[0, 0, 0],
scale_ratio_range=[0.95, 1.05],
reverse_angle=False,
training=True),
dict(
type='Pack3DDetInputs',
keys=[
'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
])
]
test_pipeline = [
dict(
type='LoadMultiViewImageFromFiles',
to_float32=True,
backend_args=backend_args),
dict(
type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf,
training=False),
dict(type='Pack3DDetInputs', keys=['img'])
]
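# With `training=False`, ResizeCropFlipImage uses a deterministic resize and
# crop and applies no flip, so test-time inputs are reproducible.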
train_dataloader = dict(
batch_size=1,
num_workers=4,
dataset=dict(
type=dataset_type,
data_prefix=dict(
pts='samples/LIDAR_TOP',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
pipeline=train_pipeline,
box_type_3d='LiDAR',
metainfo=metainfo,
test_mode=False,
modality=input_modality,
use_valid_flag=True,
backend_args=backend_args))
test_dataloader = dict(
dataset=dict(
type=dataset_type,
data_prefix=dict(
pts='samples/LIDAR_TOP',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
pipeline=test_pipeline,
box_type_3d='LiDAR',
metainfo=metainfo,
test_mode=True,
modality=input_modality,
use_valid_flag=True,
backend_args=backend_args))
val_dataloader = dict(
dataset=dict(
type=dataset_type,
data_prefix=dict(
pts='samples/LIDAR_TOP',
CAM_FRONT='samples/CAM_FRONT',
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
CAM_BACK='samples/CAM_BACK',
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
pipeline=test_pipeline,
box_type_3d='LiDAR',
metainfo=metainfo,
test_mode=True,
modality=input_modality,
use_valid_flag=True,
backend_args=backend_args))
# As in the original PETR, the image backbone is trained with a reduced
# learning rate (lr_mult=0.1 via paramwise_cfg below).
optim_wrapper = dict(
# TODO Add Amp
# type='AmpOptimWrapper',
# loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
paramwise_cfg=dict(custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
clip_grad=dict(max_norm=35, norm_type=2))
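# Mixed-precision training can likely be enabled by switching to the
# commented-out AmpOptimWrapper settings above; it is untested here (see TODO).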
num_epochs = 24
param_scheduler = [
dict(
type='LinearLR',
start_factor=1.0 / 3,
begin=0,
end=500,
by_epoch=False),
dict(
type='CosineAnnealingLR',
        # TODO Figure out what T_max should be
T_max=num_epochs,
by_epoch=True,
)
]
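# The schedule warms up linearly from lr/3 over the first 500 iterations, then
# follows cosine annealing over `num_epochs` epochs (T_max=num_epochs).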
train_cfg = dict(max_epochs=num_epochs, val_interval=num_epochs)
find_unused_parameters = False
# The pretrained image-backbone checkpoint for `load_from` can be found here:
# https://drive.google.com/file/d/1ABI5BoQCkCkP4B0pO5KBJ3Ni0tei0gZi/view
load_from = '/mnt/d/fcos3d_vovnet_imgbackbone-remapped.pth'
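# Note: the absolute path above is machine-specific; point `load_from` at your
# local copy of the remapped checkpoint.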
resume = False
# --------------Original---------------
# mAP: 0.3778
# mATE: 0.7463
# mASE: 0.2718
# mAOE: 0.4883
# mAVE: 0.9062
# mAAE: 0.2123
# NDS: 0.4264
# Eval time: 242.1s
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.556 0.555 0.153 0.091 0.917 0.216
# truck 0.330 0.805 0.218 0.119 0.859 0.250
# bus 0.412 0.789 0.205 0.162 2.067 0.337
# trailer 0.221 0.976 0.233 0.663 0.797 0.146
# construction_vehicle 0.094 1.096 0.493 1.145 0.190 0.349
# pedestrian 0.453 0.688 0.289 0.636 0.549 0.235
# motorcycle 0.368 0.690 0.256 0.622 1.417 0.149
# bicycle 0.341 0.609 0.270 0.812 0.455 0.017
# traffic_cone 0.531 0.582 0.320 nan nan nan
# barrier 0.472 0.673 0.281 0.145 nan nan
# --------------Refactored in mmdet3d v1.0---------------
# mAP: 0.3827
# mATE: 0.7375
# mASE: 0.2703
# mAOE: 0.4799
# mAVE: 0.8699
# mAAE: 0.2038
# NDS: 0.4352
# Eval time: 124.8s
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.574 0.519 0.150 0.087 0.865 0.206
# truck 0.349 0.773 0.213 0.117 0.855 0.220
# bus 0.423 0.781 0.204 0.122 1.902 0.319
# trailer 0.219 1.034 0.231 0.608 0.830 0.149
# construction_vehicle 0.084 1.062 0.486 1.245 0.172 0.360
# pedestrian 0.452 0.681 0.293 0.646 0.529 0.231
# motorcycle 0.378 0.670 0.250 0.567 1.334 0.130
# bicycle 0.347 0.639 0.264 0.788 0.472 0.016
# traffic_cone 0.538 0.553 0.325 nan nan nan
# barrier 0.464 0.662 0.287 0.137 nan nan
# --------------Refactored in mmdet3d v1.1---------------
# mAP: 0.3830
# mATE: 0.7547
# mASE: 0.2683
# mAOE: 0.4948
# mAVE: 0.8331
# mAAE: 0.2056
# NDS: 0.4358
# Eval time: 118.7s
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.567 0.538 0.151 0.086 0.873 0.212
# truck 0.341 0.785 0.213 0.113 0.821 0.234
# bus 0.426 0.766 0.201 0.128 1.813 0.343
# trailer 0.216 1.116 0.227 0.649 0.640 0.122
# construction_vehicle 0.093 1.118 0.483 1.292 0.217 0.330
# pedestrian 0.453 0.685 0.293 0.644 0.535 0.238
# motorcycle 0.374 0.700 0.253 0.624 1.291 0.154
# bicycle 0.345 0.622 0.262 0.775 0.475 0.011
# traffic_cone 0.539 0.557 0.319 nan nan nan
# barrier 0.476 0.661 0.279 0.142 nan nan