guanxiongsun
/

vfe.pytorch

Model card Files Files and versions Community

vfe.pytorch / work_dirs /stpn_swint_adam_9x /stpn_swint_adam_9x.py

guanxiongsun

stpn

0ebd6fe 4 months ago

raw history blame contribute delete

No virus

17.6 kB

	# Runtime settings: checkpointing, logging, hooks, distributed backend
	# and workflow.
	checkpoint_config = {'interval': 9}
	log_config = {
	    'interval': 50,
	    'hooks': [{'type': 'TextLoggerHook'}],
	}
	custom_hooks = [{'type': 'NumClassCheckHook'}]
	dist_params = {'backend': 'nccl'}
	log_level = 'INFO'
	# No checkpoint is loaded or resumed by default.
	load_from = None
	resume_from = None
	workflow = [('train', 1)]
	# Optimizer: AdamW, with weight decay switched off (decay_mult=0.0) for
	# parameters matched by the custom keys below.
	optimizer = {
	    'type': 'AdamW',
	    'lr': 2.5e-05,
	    'betas': (0.9, 0.999),
	    'weight_decay': 0.05,
	    'paramwise_cfg': {
	        'custom_keys': {
	            'absolute_pos_embed': {'decay_mult': 0.0},
	            'relative_position_bias_table': {'decay_mult': 0.0},
	            'norm': {'decay_mult': 0.0},
	        },
	    },
	}
	optimizer_config = {'grad_clip': None}

	# Learning-rate schedule: step policy with a linear warmup.
	lr_config = {
	    'policy': 'step',
	    'warmup': 'linear',
	    'warmup_iters': 500,
	    'warmup_ratio': 1.0 / 3,  # == 0.3333333333333333
	    'step': [6],
	}
	runner = {'type': 'EpochBasedRunner', 'max_epochs': 9}
	# Checkpoint URL; the backbone's init_cfg below points at the same value.
	pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.2/mask_rcnn_swin_tiny_patch4_window7.pth'
	is_video_model = True

	# Model: an 'STPN' wrapper around a 'FasterRCNN' detector config.
	model = dict(
	    type='STPN',
	    detector=dict(
	        type='FasterRCNN',
	        # --- backbone -------------------------------------------------
	        backbone=dict(
	            type='STPNSwinTransformer',
	            embed_dims=96,
	            depths=[2, 2, 6, 2],
	            num_heads=[3, 6, 12, 24],
	            window_size=7,
	            mlp_ratio=4,
	            qkv_bias=True,
	            qk_scale=None,
	            drop_rate=0.0,
	            attn_drop_rate=0.0,
	            drop_path_rate=0.2,
	            patch_norm=True,
	            with_cp=False,
	            convert_weights=True,
	            init_cfg=dict(type='Pretrained', checkpoint=pretrained),
	            prompt_cfg=dict(
	                num_tokens=5,
	                location='prepend',
	                deep=False,
	                dropout=0.0,
	                initiation='random',
	            ),
	        ),
	        # --- neck -----------------------------------------------------
	        neck=dict(
	            type='FPN',
	            in_channels=[96, 192, 384, 768],
	            out_channels=256,
	            num_outs=5,
	        ),
	        # --- RPN head -------------------------------------------------
	        rpn_head=dict(
	            type='RPNHead',
	            in_channels=256,
	            feat_channels=256,
	            anchor_generator=dict(
	                type='AnchorGenerator',
	                scales=[8],
	                ratios=[0.5, 1.0, 2.0],
	                strides=[4, 8, 16, 32, 64],
	            ),
	            bbox_coder=dict(
	                type='DeltaXYWHBBoxCoder',
	                target_means=[0.0, 0.0, 0.0, 0.0],
	                target_stds=[1.0, 1.0, 1.0, 1.0],
	            ),
	            loss_cls=dict(
	                type='CrossEntropyLoss',
	                use_sigmoid=True,
	                loss_weight=1.0,
	            ),
	            loss_bbox=dict(
	                type='SmoothL1Loss',
	                beta=1.0 / 9,  # == 0.1111111111111111
	                loss_weight=1.0,
	            ),
	        ),
	        # --- RoI head -------------------------------------------------
	        roi_head=dict(
	            type='StandardRoIHead',
	            bbox_roi_extractor=dict(
	                type='SingleRoIExtractor',
	                roi_layer=dict(
	                    type='RoIAlign',
	                    output_size=7,
	                    sampling_ratio=0,
	                ),
	                out_channels=256,
	                featmap_strides=[4, 8, 16, 32],
	            ),
	            bbox_head=dict(
	                type='Shared2FCBBoxHead',
	                in_channels=256,
	                fc_out_channels=1024,
	                roi_feat_size=7,
	                num_classes=30,
	                bbox_coder=dict(
	                    type='DeltaXYWHBBoxCoder',
	                    target_means=[0.0, 0.0, 0.0, 0.0],
	                    target_stds=[0.2, 0.2, 0.2, 0.2],
	                ),
	                reg_class_agnostic=False,
	                loss_cls=dict(
	                    type='CrossEntropyLoss',
	                    use_sigmoid=False,
	                    loss_weight=1.0,
	                ),
	                loss_bbox=dict(
	                    type='SmoothL1Loss',
	                    beta=1.0 / 9,  # == 0.1111111111111111
	                    loss_weight=1.0,
	                ),
	            ),
	        ),
	        # --- training-time settings -----------------------------------
	        train_cfg=dict(
	            rpn=dict(
	                assigner=dict(
	                    type='MaxIoUAssigner',
	                    pos_iou_thr=0.7,
	                    neg_iou_thr=0.3,
	                    min_pos_iou=0.3,
	                    match_low_quality=True,
	                    ignore_iof_thr=-1,
	                ),
	                sampler=dict(
	                    type='RandomSampler',
	                    num=256,
	                    pos_fraction=0.5,
	                    neg_pos_ub=-1,
	                    add_gt_as_proposals=False,
	                ),
	                allowed_border=-1,
	                pos_weight=-1,
	                debug=False,
	            ),
	            rpn_proposal=dict(
	                nms_pre=1000,
	                max_per_img=300,
	                nms=dict(type='nms', iou_threshold=0.7),
	                min_bbox_size=0,
	            ),
	            rcnn=dict(
	                assigner=dict(
	                    type='MaxIoUAssigner',
	                    pos_iou_thr=0.5,
	                    neg_iou_thr=0.5,
	                    min_pos_iou=0.5,
	                    match_low_quality=True,
	                    ignore_iof_thr=-1,
	                ),
	                sampler=dict(
	                    type='RandomSampler',
	                    num=256,
	                    pos_fraction=0.25,
	                    neg_pos_ub=-1,
	                    add_gt_as_proposals=True,
	                ),
	                mask_size=28,
	                pos_weight=-1,
	                debug=False,
	            ),
	        ),
	        # --- test-time settings ---------------------------------------
	        test_cfg=dict(
	            rpn=dict(
	                nms_pre=1000,
	                max_per_img=300,
	                nms=dict(type='nms', iou_threshold=0.7),
	                min_bbox_size=0,
	            ),
	            rcnn=dict(
	                score_thr=0.0001,
	                nms=dict(type='nms', iou_threshold=0.5),
	                max_per_img=100,
	                mask_thr_binary=0.5,
	            ),
	        ),
	    ),
	)
	# Dataset identity and the per-channel normalisation constants.
	dataset_type = 'ImagenetVIDDataset'
	data_root = 'data/ILSVRC/'
	img_norm_cfg = {
	    'mean': [123.675, 116.28, 103.53],
	    'std': [58.395, 57.12, 57.375],
	    'to_rgb': True,
	}
	# Training pipeline. The AutoAugment step holds two alternative policies:
	# a single multi-scale resize, or resize -> crop -> pad -> resize.
	train_pipeline = [
	    dict(type='LoadMultiImagesFromFile'),
	    dict(type='SeqLoadAnnotations', with_bbox=True, with_mask=False),
	    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
	    dict(
	        type='AutoAugment',
	        policies=[
	            # Policy 1: one resize over the scales (480, 1333) ... (800, 1333).
	            [
	                dict(
	                    type='SeqResize',
	                    # First entry runs 480..800 in steps of 32 (11 scales).
	                    img_scale=[(side, 1333) for side in range(480, 801, 32)],
	                    multiscale_mode='value',
	                    keep_ratio=True,
	                ),
	            ],
	            # Policy 2: resize, random crop, pad, then resize again.
	            [
	                dict(
	                    type='SeqResize',
	                    img_scale=[(400, 1333), (500, 1333), (600, 1333)],
	                    multiscale_mode='value',
	                    keep_ratio=True,
	                ),
	                dict(
	                    type='SeqRandomCrop',
	                    crop_type='absolute_range',
	                    crop_size=(384, 600),
	                    allow_negative_crop=True,
	                ),
	                dict(type='SeqMaxSizePad'),
	                dict(
	                    type='SeqResize2',
	                    img_scale=[(side, 1333) for side in range(480, 801, 32)],
	                    multiscale_mode='value',
	                    keep_ratio=True,
	                ),
	            ],
	        ],
	    ),
	    dict(
	        type='SeqNormalize',
	        mean=[123.675, 116.28, 103.53],
	        std=[58.395, 57.12, 57.375],
	        to_rgb=True,
	    ),
	    dict(type='SeqPad', size_divisor=16),
	    dict(type='VideoCollect', keys=['img', 'gt_bboxes', 'gt_labels']),
	    dict(type='ConcatVideoReferences'),
	    dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
	]
	# Test-time pipeline: fixed-scale resize, flip with flip_ratio=0.0,
	# normalise, pad, collect, then convert to tensors.
	test_pipeline = [
	    {'type': 'LoadMultiImagesFromFile'},
	    {'type': 'SeqResize', 'img_scale': (1000, 600), 'keep_ratio': True},
	    {'type': 'SeqRandomFlip', 'share_params': True, 'flip_ratio': 0.0},
	    {
	        'type': 'SeqNormalize',
	        'mean': [123.675, 116.28, 103.53],
	        'std': [58.395, 57.12, 57.375],
	        'to_rgb': True,
	    },
	    {'type': 'SeqPad', 'size_divisor': 16},
	    {
	        'type': 'VideoCollect',
	        'keys': ['img'],
	        'meta_keys': ('num_left_ref_imgs', 'frame_stride'),
	    },
	    {'type': 'ConcatVideoReferences'},
	    {'type': 'MultiImagesToTensor', 'ref_prefix': 'ref'},
	    {'type': 'ToList'},
	]
	# Data loading settings: two training datasets (VID and DET annotation
	# files) plus val/test splits that both read the VID validation file.
	data = dict(
	    samples_per_gpu=1,
	    workers_per_gpu=4,
	    train=[
	        # Training set 1: annotations from imagenet_vid_train.json.
	        dict(
	            type='ImagenetVIDDataset',
	            ann_file='data/ILSVRC/annotations/imagenet_vid_train.json',
	            img_prefix='data/ILSVRC/Data/VID',
	            ref_img_sampler=dict(
	                num_ref_imgs=2,
	                frame_range=9,
	                filter_key_img=True,
	                method='bilateral_uniform',
	            ),
	            pipeline=[
	                dict(type='LoadMultiImagesFromFile'),
	                dict(
	                    type='SeqLoadAnnotations',
	                    with_bbox=True,
	                    with_mask=False,
	                ),
	                dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
	                dict(
	                    type='AutoAugment',
	                    policies=[
	                        [
	                            dict(
	                                type='SeqResize',
	                                # (480, 1333) ... (800, 1333), step 32.
	                                img_scale=[(side, 1333)
	                                           for side in range(480, 801, 32)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                        ],
	                        [
	                            dict(
	                                type='SeqResize',
	                                img_scale=[(400, 1333), (500, 1333),
	                                           (600, 1333)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                            dict(
	                                type='SeqRandomCrop',
	                                crop_type='absolute_range',
	                                crop_size=(384, 600),
	                                allow_negative_crop=True,
	                            ),
	                            dict(type='SeqMaxSizePad'),
	                            dict(
	                                type='SeqResize2',
	                                img_scale=[(side, 1333)
	                                           for side in range(480, 801, 32)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                        ],
	                    ],
	                ),
	                dict(
	                    type='SeqNormalize',
	                    mean=[123.675, 116.28, 103.53],
	                    std=[58.395, 57.12, 57.375],
	                    to_rgb=True,
	                ),
	                dict(type='SeqPad', size_divisor=16),
	                dict(
	                    type='VideoCollect',
	                    keys=['img', 'gt_bboxes', 'gt_labels'],
	                ),
	                dict(type='ConcatVideoReferences'),
	                dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
	            ],
	        ),
	        # Training set 2: annotations from imagenet_det_30plus1cls.json,
	        # with load_as_video=False and frame_range=0.
	        dict(
	            type='ImagenetVIDDataset',
	            load_as_video=False,
	            ann_file='data/ILSVRC/annotations/imagenet_det_30plus1cls.json',
	            img_prefix='data/ILSVRC/Data/DET',
	            ref_img_sampler=dict(
	                num_ref_imgs=2,
	                frame_range=0,
	                filter_key_img=False,
	                method='bilateral_uniform',
	            ),
	            pipeline=[
	                dict(type='LoadMultiImagesFromFile'),
	                dict(
	                    type='SeqLoadAnnotations',
	                    with_bbox=True,
	                    with_mask=False,
	                ),
	                dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
	                dict(
	                    type='AutoAugment',
	                    policies=[
	                        [
	                            dict(
	                                type='SeqResize',
	                                # (480, 1333) ... (800, 1333), step 32.
	                                img_scale=[(side, 1333)
	                                           for side in range(480, 801, 32)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                        ],
	                        [
	                            dict(
	                                type='SeqResize',
	                                img_scale=[(400, 1333), (500, 1333),
	                                           (600, 1333)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                            dict(
	                                type='SeqRandomCrop',
	                                crop_type='absolute_range',
	                                crop_size=(384, 600),
	                                allow_negative_crop=True,
	                            ),
	                            dict(type='SeqMaxSizePad'),
	                            dict(
	                                type='SeqResize2',
	                                img_scale=[(side, 1333)
	                                           for side in range(480, 801, 32)],
	                                multiscale_mode='value',
	                                keep_ratio=True,
	                            ),
	                        ],
	                    ],
	                ),
	                dict(
	                    type='SeqNormalize',
	                    mean=[123.675, 116.28, 103.53],
	                    std=[58.395, 57.12, 57.375],
	                    to_rgb=True,
	                ),
	                dict(type='SeqPad', size_divisor=16),
	                dict(
	                    type='VideoCollect',
	                    keys=['img', 'gt_bboxes', 'gt_labels'],
	                ),
	                dict(type='ConcatVideoReferences'),
	                dict(type='SeqDefaultFormatBundle', ref_prefix='ref'),
	            ],
	        ),
	    ],
	    # Validation split: 14 reference images, frame_range [-7, 7].
	    val=dict(
	        type='ImagenetVIDDataset',
	        ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
	        img_prefix='data/ILSVRC/Data/VID',
	        ref_img_sampler=dict(
	            num_ref_imgs=14,
	            frame_range=[-7, 7],
	            method='test_with_adaptive_stride',
	        ),
	        pipeline=[
	            dict(type='LoadMultiImagesFromFile'),
	            dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
	            dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
	            dict(
	                type='SeqNormalize',
	                mean=[123.675, 116.28, 103.53],
	                std=[58.395, 57.12, 57.375],
	                to_rgb=True,
	            ),
	            dict(type='SeqPad', size_divisor=16),
	            dict(
	                type='VideoCollect',
	                keys=['img'],
	                meta_keys=('num_left_ref_imgs', 'frame_stride'),
	            ),
	            dict(type='ConcatVideoReferences'),
	            dict(type='MultiImagesToTensor', ref_prefix='ref'),
	            dict(type='ToList'),
	        ],
	        test_mode=True,
	    ),
	    # Test split: same annotation file and settings as the val split.
	    test=dict(
	        type='ImagenetVIDDataset',
	        ann_file='data/ILSVRC/annotations/imagenet_vid_val.json',
	        img_prefix='data/ILSVRC/Data/VID',
	        ref_img_sampler=dict(
	            num_ref_imgs=14,
	            frame_range=[-7, 7],
	            method='test_with_adaptive_stride',
	        ),
	        pipeline=[
	            dict(type='LoadMultiImagesFromFile'),
	            dict(type='SeqResize', img_scale=(1000, 600), keep_ratio=True),
	            dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.0),
	            dict(
	                type='SeqNormalize',
	                mean=[123.675, 116.28, 103.53],
	                std=[58.395, 57.12, 57.375],
	                to_rgb=True,
	            ),
	            dict(type='SeqPad', size_divisor=16),
	            dict(
	                type='VideoCollect',
	                keys=['img'],
	                meta_keys=('num_left_ref_imgs', 'frame_stride'),
	            ),
	            dict(type='ConcatVideoReferences'),
	            dict(type='MultiImagesToTensor', ref_prefix='ref'),
	            dict(type='ToList'),
	        ],
	        test_mode=True,
	    ),
	)
	# Run-level settings: epoch count, evaluation options, output directory
	# and GPU ids 0..7.
	total_epochs = 9
	evaluation = {
	    'metric': ['bbox'],
	    'vid_style': True,
	    'interval': 9,
	}
	work_dir = './work_dirs/stpn_swint_adam_9x'
	gpu_ids = range(8)