gntmky
/

mm3dtest

Model card Files Files and versions Community

mm3dtest / projects /PETR /configs /petr_vovnet_gridmask_p4_800x320.py

giantmonkeyTC

2344

34d1f8b 5 months ago

12.1 kB

	_base_ = [
	'../../../configs/_base_/datasets/nus-3d.py',
	'../../../configs/_base_/default_runtime.py',
	'../../../configs/_base_/schedules/cyclic-20e.py'
	]
	backbone_norm_cfg = dict(type='LN', requires_grad=True)
	custom_imports = dict(imports=['projects.PETR.petr'])

	randomness = dict(seed=1, deterministic=False, diff_rank_seed=False)
	# If point cloud range is changed, the models should also change their point
	# cloud range accordingly
	point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
	voxel_size = [0.2, 0.2, 8]
	img_norm_cfg = dict(
	mean=[103.530, 116.280, 123.675],
	std=[57.375, 57.120, 58.395],
	to_rgb=False)
	# For nuScenes we usually do 10-class detection
	class_names = [
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
	'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
	]
	metainfo = dict(classes=class_names)

	input_modality = dict(use_camera=True)
	model = dict(
	type='PETR',
	data_preprocessor=dict(
	type='Det3DDataPreprocessor',
	mean=[103.530, 116.280, 123.675],
	std=[57.375, 57.120, 58.395],
	bgr_to_rgb=False,
	pad_size_divisor=32),
	use_grid_mask=True,
	img_backbone=dict(
	type='VoVNetCP',
	spec_name='V-99-eSE',
	norm_eval=True,
	frozen_stages=-1,
	input_ch=3,
	out_features=(
	'stage4',
	'stage5',
	)),
	img_neck=dict(
	type='CPFPN', in_channels=[768, 1024], out_channels=256, num_outs=2),
	pts_bbox_head=dict(
	type='PETRHead',
	num_classes=10,
	in_channels=256,
	num_query=900,
	LID=True,
	with_position=True,
	with_multiview=True,
	position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
	normedlinear=False,
	transformer=dict(
	type='PETRTransformer',
	decoder=dict(
	type='PETRTransformerDecoder',
	return_intermediate=True,
	num_layers=6,
	transformerlayers=dict(
	type='PETRTransformerDecoderLayer',
	attn_cfgs=[
	dict(
	type='MultiheadAttention',
	embed_dims=256,
	num_heads=8,
	attn_drop=0.1,
	dropout_layer=dict(type='Dropout', drop_prob=0.1)),
	dict(
	type='PETRMultiheadAttention',
	embed_dims=256,
	num_heads=8,
	attn_drop=0.1,
	dropout_layer=dict(type='Dropout', drop_prob=0.1)),
	],
	feedforward_channels=2048,
	ffn_dropout=0.1,
	operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
	'ffn', 'norm')),
	)),
	bbox_coder=dict(
	type='NMSFreeCoder',
	post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
	pc_range=point_cloud_range,
	max_num=300,
	voxel_size=voxel_size,
	num_classes=10),
	positional_encoding=dict(
	type='SinePositionalEncoding3D', num_feats=128, normalize=True),
	loss_cls=dict(
	type='mmdet.FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=2.0),
	loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
	loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
	# model training and testing settings
	train_cfg=dict(
	pts=dict(
	grid_size=[512, 512, 1],
	voxel_size=voxel_size,
	point_cloud_range=point_cloud_range,
	out_size_factor=4,
	assigner=dict(
	type='HungarianAssigner3D',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
	iou_cost=dict(
	type='IoUCost', weight=0.0
	), # Fake cost. Just to be compatible with DETR head.
	pc_range=point_cloud_range))))

	dataset_type = 'NuScenesDataset'
	data_root = 'data/nuscenes/'
	backend_args = None

	db_sampler = dict(
	data_root=data_root,
	info_path=data_root + 'nuscenes_dbinfos_train.pkl',
	rate=1.0,
	prepare=dict(
	filter_by_difficulty=[-1],
	filter_by_min_points=dict(
	car=5,
	truck=5,
	bus=5,
	trailer=5,
	construction_vehicle=5,
	traffic_cone=5,
	barrier=5,
	motorcycle=5,
	bicycle=5,
	pedestrian=5)),
	classes=class_names,
	sample_groups=dict(
	car=2,
	truck=3,
	construction_vehicle=7,
	bus=4,
	trailer=6,
	barrier=2,
	motorcycle=6,
	bicycle=6,
	pedestrian=2,
	traffic_cone=2),
	points_loader=dict(
	type='LoadPointsFromFile',
	coord_type='LIDAR',
	load_dim=5,
	use_dim=[0, 1, 2, 3, 4],
	backend_args=backend_args),
	backend_args=backend_args)
	ida_aug_conf = {
	'resize_lim': (0.47, 0.625),
	'final_dim': (320, 800),
	'bot_pct_lim': (0.0, 0.0),
	'rot_lim': (0.0, 0.0),
	'H': 900,
	'W': 1600,
	'rand_flip': True,
	}

	train_pipeline = [
	dict(
	type='LoadMultiViewImageFromFiles',
	to_float32=True,
	backend_args=backend_args),
	dict(
	type='LoadAnnotations3D',
	with_bbox_3d=True,
	with_label_3d=True,
	with_attr_label=False),
	dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
	dict(type='ObjectNameFilter', classes=class_names),
	dict(
	type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True),
	dict(
	type='GlobalRotScaleTransImage',
	rot_range=[-0.3925, 0.3925],
	translation_std=[0, 0, 0],
	scale_ratio_range=[0.95, 1.05],
	reverse_angle=False,
	training=True),
	dict(
	type='Pack3DDetInputs',
	keys=[
	'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
	'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
	])
	]
	test_pipeline = [
	dict(
	type='LoadMultiViewImageFromFiles',
	to_float32=True,
	backend_args=backend_args),
	dict(
	type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf,
	training=False),
	dict(type='Pack3DDetInputs', keys=['img'])
	]

	train_dataloader = dict(
	batch_size=1,
	num_workers=4,
	dataset=dict(
	type=dataset_type,
	data_prefix=dict(
	pts='samples/LIDAR_TOP',
	CAM_FRONT='samples/CAM_FRONT',
	CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
	CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
	CAM_BACK='samples/CAM_BACK',
	CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
	CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
	pipeline=train_pipeline,
	box_type_3d='LiDAR',
	metainfo=metainfo,
	test_mode=False,
	modality=input_modality,
	use_valid_flag=True,
	backend_args=backend_args))
	test_dataloader = dict(
	dataset=dict(
	type=dataset_type,
	data_prefix=dict(
	pts='samples/LIDAR_TOP',
	CAM_FRONT='samples/CAM_FRONT',
	CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
	CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
	CAM_BACK='samples/CAM_BACK',
	CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
	CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
	pipeline=test_pipeline,
	box_type_3d='LiDAR',
	metainfo=metainfo,
	test_mode=True,
	modality=input_modality,
	use_valid_flag=True,
	backend_args=backend_args))
	val_dataloader = dict(
	dataset=dict(
	type=dataset_type,
	data_prefix=dict(
	pts='samples/LIDAR_TOP',
	CAM_FRONT='samples/CAM_FRONT',
	CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
	CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
	CAM_BACK='samples/CAM_BACK',
	CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
	CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
	pipeline=test_pipeline,
	box_type_3d='LiDAR',
	metainfo=metainfo,
	test_mode=True,
	modality=input_modality,
	use_valid_flag=True,
	backend_args=backend_args))

	# Different from original PETR:
	# We don't use special lr for image_backbone
	# This seems won't affect model performance
	optim_wrapper = dict(
	# TODO Add Amp
	# type='AmpOptimWrapper',
	# loss_scale='dynamic',
	optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
	paramwise_cfg=dict(custom_keys={
	'img_backbone': dict(lr_mult=0.1),
	}),
	clip_grad=dict(max_norm=35, norm_type=2))

	num_epochs = 24

	param_scheduler = [
	dict(
	type='LinearLR',
	start_factor=1.0 / 3,
	begin=0,
	end=500,
	by_epoch=False),
	dict(
	type='CosineAnnealingLR',
	# TODO Figure out what T_max
	T_max=num_epochs,
	by_epoch=True,
	)
	]

	train_cfg = dict(max_epochs=num_epochs, val_interval=num_epochs)

	find_unused_parameters = False

	# pretrain_path can be found here:
	# https://drive.google.com/file/d/1ABI5BoQCkCkP4B0pO5KBJ3Ni0tei0gZi/view
	load_from = '/mnt/d/fcos3d_vovnet_imgbackbone-remapped.pth'
	resume = False

	# --------------Original---------------
	# mAP: 0.3778
	# mATE: 0.7463
	# mASE: 0.2718
	# mAOE: 0.4883
	# mAVE: 0.9062
	# mAAE: 0.2123
	# NDS: 0.4264
	# Eval time: 242.1s

	# Per-class results:
	# Object Class AP ATE ASE AOE AVE AAE
	# car 0.556 0.555 0.153 0.091 0.917 0.216
	# truck 0.330 0.805 0.218 0.119 0.859 0.250
	# bus 0.412 0.789 0.205 0.162 2.067 0.337
	# trailer 0.221 0.976 0.233 0.663 0.797 0.146
	# construction_vehicle 0.094 1.096 0.493 1.145 0.190 0.349
	# pedestrian 0.453 0.688 0.289 0.636 0.549 0.235
	# motorcycle 0.368 0.690 0.256 0.622 1.417 0.149
	# bicycle 0.341 0.609 0.270 0.812 0.455 0.017
	# traffic_cone 0.531 0.582 0.320 nan nan nan
	# barrier 0.472 0.673 0.281 0.145 nan nan

	# --------------Refactored in mmdet3d v1.0---------------
	# mAP: 0.3827
	# mATE: 0.7375
	# mASE: 0.2703
	# mAOE: 0.4799
	# mAVE: 0.8699
	# mAAE: 0.2038
	# NDS: 0.4352
	# Eval time: 124.8s

	# Per-class results:
	# Object Class AP ATE ASE AOE AVE AAE
	# car 0.574 0.519 0.150 0.087 0.865 0.206
	# truck 0.349 0.773 0.213 0.117 0.855 0.220
	# bus 0.423 0.781 0.204 0.122 1.902 0.319
	# trailer 0.219 1.034 0.231 0.608 0.830 0.149
	# construction_vehicle 0.084 1.062 0.486 1.245 0.172 0.360
	# pedestrian 0.452 0.681 0.293 0.646 0.529 0.231
	# motorcycle 0.378 0.670 0.250 0.567 1.334 0.130
	# bicycle 0.347 0.639 0.264 0.788 0.472 0.016
	# traffic_cone 0.538 0.553 0.325 nan nan nan
	# barrier 0.464 0.662 0.287 0.137 nan nan

	# --------------Refactored in mmdet3d v1.1---------------
	# mAP: 0.3830
	# mATE: 0.7547
	# mASE: 0.2683
	# mAOE: 0.4948
	# mAVE: 0.8331
	# mAAE: 0.2056
	# NDS: 0.4358
	# Eval time: 118.7s

	# Per-class results:
	# Object Class AP ATE ASE AOE AVE AAE
	# car 0.567 0.538 0.151 0.086 0.873 0.212
	# truck 0.341 0.785 0.213 0.113 0.821 0.234
	# bus 0.426 0.766 0.201 0.128 1.813 0.343
	# trailer 0.216 1.116 0.227 0.649 0.640 0.122
	# construction_vehicle 0.093 1.118 0.483 1.292 0.217 0.330
	# pedestrian 0.453 0.685 0.293 0.644 0.535 0.238
	# motorcycle 0.374 0.700 0.253 0.624 1.291 0.154
	# bicycle 0.345 0.622 0.262 0.775 0.475 0.011
	# traffic_cone 0.539 0.557 0.319 nan nan nan
	# barrier 0.476 0.661 0.279 0.142 nan nan