# 3dtest / projects / TPVFormer / configs / tpvformer_8xb1-2x_nus-seg.py
# giantmonkeyTC
# mm2
# c2ca15f
# Inherit the shared default runtime settings.
_base_ = ['../../../configs/_base_/default_runtime.py']

# Pull in the TPVFormer project package; a failed import is not tolerated.
custom_imports = dict(
    imports=['projects.TPVFormer.tpvformer'],
    allow_failed_imports=False,
)

# ---------------------------------------------------------------------------
# Dataset settings
# ---------------------------------------------------------------------------
dataset_type = 'NuScenesSegDataset'
data_root = 'data/nuscenes/'

# Sub-directories for the lidar sweeps, the per-point semantic masks and the
# six camera streams.
data_prefix = {
    'pts': 'samples/LIDAR_TOP',
    'pts_semantic_mask': 'lidarseg/v1.0-trainval',
    'CAM_FRONT': 'samples/CAM_FRONT',
    'CAM_FRONT_LEFT': 'samples/CAM_FRONT_LEFT',
    'CAM_FRONT_RIGHT': 'samples/CAM_FRONT_RIGHT',
    'CAM_BACK': 'samples/CAM_BACK',
    'CAM_BACK_RIGHT': 'samples/CAM_BACK_RIGHT',
    'CAM_BACK_LEFT': 'samples/CAM_BACK_LEFT',
}
# File backend arguments forwarded to every loading transform below.
backend_args = None

# ---------------------------------------------------------------------------
# Data pipelines
# ---------------------------------------------------------------------------
# Training: load the 6 camera images, the lidar points and the per-point
# semantic labels, apply photometric distortion to the views, remap the
# labels and pack everything for the model.
train_pipeline = [
    dict(
        type='BEVLoadMultiViewImageFromFiles',
        to_float32=False,
        color_type='unchanged',
        num_views=6,
        backend_args=backend_args,
    ),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=3,
        backend_args=backend_args,
    ),
    # Only the 3D segmentation mask is requested; boxes, labels and
    # attribute labels are switched off.
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=False,
        with_label_3d=False,
        with_seg_3d=True,
        with_attr_label=False,
        seg_3d_dtype='np.uint8',
    ),
    dict(
        type='MultiViewWrapper',
        transforms=dict(type='PhotoMetricDistortion3D'),
    ),
    dict(type='SegLabelMapping'),
    dict(
        type='Pack3DDetInputs',
        keys=['img', 'points', 'pts_semantic_mask'],
        meta_keys=['lidar2img'],
    ),
]

# Validation: the same steps as training minus the MultiViewWrapper
# photometric augmentation.
val_pipeline = [
    dict(
        type='BEVLoadMultiViewImageFromFiles',
        to_float32=False,
        color_type='unchanged',
        num_views=6,
        backend_args=backend_args,
    ),
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=3,
        backend_args=backend_args,
    ),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=False,
        with_label_3d=False,
        with_seg_3d=True,
        with_attr_label=False,
        seg_3d_dtype='np.uint8',
    ),
    dict(type='SegLabelMapping'),
    dict(
        type='Pack3DDetInputs',
        keys=['img', 'points', 'pts_semantic_mask'],
        meta_keys=['lidar2img'],
    ),
]

# Testing reuses the validation pipeline object as-is.
test_pipeline = val_pipeline
# ---------------------------------------------------------------------------
# Dataloaders and evaluators
# ---------------------------------------------------------------------------
# Training loader: one sample per batch, shuffled, incomplete last batch
# dropped.
train_dataloader = dict(
    batch_size=1,
    num_workers=4,
    persistent_workers=True,
    drop_last=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=data_prefix,
        ann_file='nuscenes_infos_train.pkl',
        pipeline=train_pipeline,
        test_mode=False,
    ),
)

# Validation loader: same batch size and worker count, but unshuffled, keeps
# the last batch and puts the dataset in test mode.
val_dataloader = dict(
    batch_size=1,
    num_workers=4,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=data_prefix,
        ann_file='nuscenes_infos_val.pkl',
        pipeline=val_pipeline,
        test_mode=True,
    ),
)

# Testing shares the validation loader and evaluator objects.
test_dataloader = val_dataloader

val_evaluator = dict(type='SegMetric')
test_evaluator = val_evaluator
# ---------------------------------------------------------------------------
# Visualization
# ---------------------------------------------------------------------------
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='Det3DLocalVisualizer',
    vis_backends=vis_backends,
    name='visualizer',
)

# ---------------------------------------------------------------------------
# Optimization
# ---------------------------------------------------------------------------
# AdamW with gradient-norm clipping; parameters matched by the 'backbone'
# key get a 0.1 learning-rate multiplier.
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}),
    clip_grad=dict(max_norm=35, norm_type=2),
)

# Learning-rate schedule: an iteration-based linear ramp over the first 500
# iterations, plus an epoch-based cosine annealing with T_max=24 that is
# converted to iteration granularity.
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1e-5,
        by_epoch=False,
        begin=0,
        end=500,
    ),
    dict(
        type='CosineAnnealingLR',
        begin=0,
        T_max=24,
        by_epoch=True,
        eta_min=1e-6,
        convert_to_iter_based=True,
    ),
]

# ---------------------------------------------------------------------------
# Loops and hooks
# ---------------------------------------------------------------------------
# 24 training epochs, validating after every epoch.
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Checkpoint hook with interval=1.
default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
# ---------------------------------------------------------------------------
# TPV geometry and attention hyper-parameters
# ---------------------------------------------------------------------------
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

_dim_ = 128  # embedding width shared by the attention modules
num_heads = 8
_ffn_dim_ = 2 * _dim_  # feed-forward width: twice the embedding width

# Size of the three TPV axes and the per-axis scale factors.
tpv_h_ = 200
tpv_w_ = 200
tpv_z_ = 16
scale_h = 1
scale_w = 1
scale_z = 1

# Three-element settings passed to the image cross-attention
# (num_z_anchors / num_points) and to the encoder (num_points_in_pillar).
num_points_in_pillar = [4, 32, 32]
num_points = [8, 64, 64]

# Settings of the cross-view hybrid attention.
hybrid_attn_anchors = 16
hybrid_attn_points = 32
hybrid_attn_init = 0

# TPV size multiplied by the matching scale factor on every axis.
grid_shape = [
    tpv_h_ * scale_h,
    tpv_w_ * scale_w,
    tpv_z_ * scale_z,
]

# ---------------------------------------------------------------------------
# Encoder layer templates
# ---------------------------------------------------------------------------
# Layer variant with both attentions: cross-view hybrid attention
# ('self_attn') followed by image cross-attention ('cross_attn') and an FFN,
# each followed by a norm.
self_cross_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='TPVCrossViewHybridAttention',
            tpv_h=tpv_h_,
            tpv_w=tpv_w_,
            tpv_z=tpv_z_,
            num_anchors=hybrid_attn_anchors,
            embed_dims=_dim_,
            num_heads=num_heads,
            num_points=hybrid_attn_points,
            init_mode=hybrid_attn_init,
            dropout=0.1,
        ),
        dict(
            type='TPVImageCrossAttention',
            pc_range=point_cloud_range,
            num_cams=6,
            dropout=0.1,
            deformable_attention=dict(
                type='TPVMSDeformableAttention3D',
                embed_dims=_dim_,
                num_heads=num_heads,
                num_points=num_points,
                num_z_anchors=num_points_in_pillar,
                num_levels=4,
                floor_sampling_offset=False,
                tpv_h=tpv_h_,
                tpv_w=tpv_w_,
                tpv_z=tpv_z_,
            ),
            embed_dims=_dim_,
            tpv_h=tpv_h_,
            tpv_w=tpv_w_,
            tpv_z=tpv_z_,
        ),
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=(
        'self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm',
    ),
)

# Layer variant without the image cross-attention: cross-view hybrid
# attention and an FFN only.
self_layer = dict(
    type='TPVFormerLayer',
    attn_cfgs=[
        dict(
            type='TPVCrossViewHybridAttention',
            tpv_h=tpv_h_,
            tpv_w=tpv_w_,
            tpv_z=tpv_z_,
            num_anchors=hybrid_attn_anchors,
            embed_dims=_dim_,
            num_heads=num_heads,
            num_points=hybrid_attn_points,
            init_mode=hybrid_attn_init,
            dropout=0.1,
        ),
    ],
    feedforward_channels=_ffn_dim_,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
)
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = dict(
    type='TPVFormer',
    # Image normalisation / padding, point voxelisation and the GridMask
    # batch augmentation.
    data_preprocessor=dict(
        type='TPVFormerDataPreprocessor',
        pad_size_divisor=32,
        mean=[103.530, 116.280, 123.675],
        std=[1.0, 1.0, 1.0],
        voxel=True,
        voxel_type='cylindrical',
        voxel_layer=dict(
            grid_shape=grid_shape,
            point_cloud_range=point_cloud_range,
            max_num_points=-1,
            max_voxels=-1,
        ),
        batch_augments=[
            dict(
                type='GridMask',
                use_h=True,
                use_w=True,
                rotate=1,
                offset=False,
                ratio=0.5,
                mode=1,
                prob=0.7,
            ),
        ],
    ),
    # ResNet-101 (caffe style) with DCNv2 enabled in the last two stages,
    # initialised from the 'backbone.' part of the pretrained checkpoint.
    backbone=dict(
        type='mmdet.ResNet',
        depth=101,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        # NOTE: the original DCNv2 prints a log message when
        # load_state_dict is performed.
        dcn=dict(
            type='DCNv2',
            deform_groups=1,
            fallback_on_stride=False,
        ),
        stage_with_dcn=(False, False, True, True),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='checkpoints/tpvformer_pretrained_fcos3d_r101_dcn.pth',
            prefix='backbone.',
        ),
    ),
    # FPN turning the three backbone outputs into four `_dim_`-channel
    # levels, initialised from the 'neck.' part of the same checkpoint.
    neck=dict(
        type='mmdet.FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True,
        init_cfg=dict(
            type='Pretrained',
            checkpoint='checkpoints/tpvformer_pretrained_fcos3d_r101_dcn.pth',
            prefix='neck.',
        ),
    ),
    # Five encoder layers: three with image cross-attention followed by two
    # with cross-view hybrid attention only.
    encoder=dict(
        type='TPVFormerEncoder',
        tpv_h=tpv_h_,
        tpv_w=tpv_w_,
        tpv_z=tpv_z_,
        num_layers=5,
        pc_range=point_cloud_range,
        num_points_in_pillar=num_points_in_pillar,
        num_points_in_pillar_cross_view=[16, 16, 16],
        return_intermediate=False,
        transformerlayers=[self_cross_layer] * 3 + [self_layer] * 2,
        embed_dims=_dim_,
        positional_encoding=dict(
            type='TPVFormerPositionalEncoding',
            num_feats=[48, 48, 32],
            h=tpv_h_,
            w=tpv_w_,
            z=tpv_z_,
        ),
    ),
    # Segmentation head over 17 classes trained with cross-entropy (on
    # 'voxel' input) plus Lovasz loss (on 'points' input); index 0 is ignored.
    decode_head=dict(
        type='TPVFormerDecoder',
        tpv_h=tpv_h_,
        tpv_w=tpv_w_,
        tpv_z=tpv_z_,
        num_classes=17,
        in_dims=_dim_,
        hidden_dims=_dim_ * 2,
        out_dims=_dim_,
        scale_h=scale_h,
        scale_w=scale_w,
        scale_z=scale_z,
        loss_ce=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=False,
            class_weight=None,
            avg_non_ignore=True,
            loss_weight=1.0,
        ),
        loss_lovasz=dict(
            type='LovaszLoss',
            loss_weight=1.0,
            reduction='none',
        ),
        lovasz_input='points',
        ce_input='voxel',
        ignore_index=0,
    ),
)