"""Training / evaluation configuration for a 'Poseur' keypoint model on COCO.

The module is a flat list of top-level assignments (runtime, optimizer,
schedule, model, data pipelines, dataset splits).  The number of neck input
levels and their channel widths are derived from ``cfg.upscale`` and
``cfg.feat_dim``, which come from the project-level ``config`` module.

Raises:
    ValueError: if ``cfg.upscale`` is not one of 1, 2, 4 or 8.
"""
from config import cfg

# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(
    interval=25,
    metric='mAP',
    key_indicator='AP',
    rle_score=True,
)

# ---------------------------------------------------------------------------
# Optimizer and schedule
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    weight_decay=1e-4,
    paramwise_cfg=dict(
        # Parameters whose name matches a key get the given LR multiplier.
        custom_keys={
            # 'backbone': dict(lr_mult=0.1),
            'sampling_offsets': dict(lr_mult=0.1),
            'reference_points': dict(lr_mult=0.1),
            # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0),
        },
    ),
)

optimizer_config = dict(grad_clip=None)

lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[255, 310],
)
total_epochs = 325

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# ---------------------------------------------------------------------------
# Keypoint channels
# ---------------------------------------------------------------------------
# NOTE(review): 20 output channels / joints are declared here, but
# `dataset_channel` and `inference_channel` only list indices 0-16 (17
# entries).  Confirm this mismatch is intentional.
channel_cfg = dict(
    num_output_channels=20,
    dataset_joints=20,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ],
)

emb_dim = 256

# ---------------------------------------------------------------------------
# Neck input levels derived from cfg.upscale
# ---------------------------------------------------------------------------
# Each doubling of `cfg.upscale` adds one level; every extra level is
# prepended with half the channel count of the entry that follows it.
if cfg.upscale == 1:
    neck_in_channels = [cfg.feat_dim]
    num_levels = 1
elif cfg.upscale == 2:
    neck_in_channels = [cfg.feat_dim // 2, cfg.feat_dim]
    # neck_in_channels = [768, 768]
    num_levels = 2
elif cfg.upscale == 4:
    neck_in_channels = [cfg.feat_dim // 4, cfg.feat_dim // 2, cfg.feat_dim]
    # neck_in_channels = [768, 768, 768]
    num_levels = 3
elif cfg.upscale == 8:
    neck_in_channels = [
        cfg.feat_dim // 8,
        cfg.feat_dim // 4,
        cfg.feat_dim // 2,
        cfg.feat_dim,
    ]
    # neck_in_channels = [768, 768, 768, 768]
    num_levels = 4
else:
    # Fail here with a clear message instead of hitting a NameError on
    # `neck_in_channels` / `num_levels` further down in the model dict.
    raise ValueError(
        f'Unsupported cfg.upscale={cfg.upscale!r}; expected one of 1, 2, 4, 8.'
    )

# ---------------------------------------------------------------------------
# Model settings
# ---------------------------------------------------------------------------
norm_cfg = dict(type='BN', requires_grad=True)
# norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='Poseur',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        norm_cfg=norm_cfg,
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
    ),
    neck=dict(
        type='ChannelMapper',
        in_channels=neck_in_channels,
        kernel_size=1,
        out_channels=emb_dim,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
    ),
    keypoint_head=dict(
        type='Poseur_noise_sample',
        in_channels=512,
        num_queries=channel_cfg['num_output_channels'],
        num_reg_fcs=2,
        num_joints=channel_cfg['num_output_channels'],
        with_box_refine=True,
        loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True),
        # loss_coord_dec=dict(type='L1Loss', use_target_weight=True, loss_weight=5),
        loss_hp_keypoint=dict(
            type='JointsMSELoss',
            use_target_weight=True,
            loss_weight=10,
        ),
        # loss_coord_keypoint=dict(type='L1Loss', use_target_weight=True, loss_weight=1),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=emb_dim // 2,
            normalize=True,
            offset=-0.5,
        ),
        transformer=dict(
            type='PoseurTransformer_v3',
            num_joints=channel_cfg['num_output_channels'],
            query_pose_emb=True,
            embed_dims=emb_dim,
            encoder=dict(
                type='DetrTransformerEncoder_zero_layer',
                num_layers=0,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        num_levels=num_levels,
                        num_points=4,
                        embed_dims=emb_dim,
                    ),
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
                ),
            ),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer_grouped',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=emb_dim,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='MultiScaleDeformableAttention_post_value',
                            num_levels=num_levels,
                            num_points=4,
                            embed_dims=emb_dim,
                        ),
                    ],
                    feedforward_channels=1024,
                    num_joints=channel_cfg['num_output_channels'],
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        as_two_stage=True,
        use_heatmap_loss=False,
    ),
    train_cfg=dict(image_size=[192, 256]),
    test_cfg=dict(
        image_size=[192, 256],
        flip_test=True,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11,
    ),
)

# ---------------------------------------------------------------------------
# Dataset-level settings shared by the train / val / test splits
# ---------------------------------------------------------------------------
data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    # use_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    det_bbox_thr=0.0,
    # use_gt_bbox=True,
    # bbox_file='',
    use_gt_bbox=False,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
)

# ---------------------------------------------------------------------------
# Pipelines
# ---------------------------------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3,
    ),
    dict(
        type='TopDownGetRandomScaleRotation',
        rot_factor=40,
        scale_factor=0.5,
    ),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    # dict(
    #     type='TopDownGenerateTarget',
    #     kernel=[(11, 11), (9, 9), (7, 7), (5, 5)],
    #     encoding='Megvii'),
    dict(
        target_type='wo_mask',
        type='TopDownGenerateCoordAndHeatMapTarget',
        encoding='MSRA',
        sigma=2,
    ),
    dict(
        type='Collect',
        keys=[
            'img', 'coord_target', 'coord_target_weight',
            'hp_target', 'hp_target_weight',
        ],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs'
        ],
    ),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    dict(
        type='Collect',
        keys=[
            'img',
        ],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs'
        ],
    ),
]

# The test pipeline is the very same list object as the validation pipeline.
test_pipeline = val_pipeline

# ---------------------------------------------------------------------------
# Dataset splits
# ---------------------------------------------------------------------------
data_root = 'data/coco'
data = dict(
    samples_per_gpu=32,
    # samples_per_gpu=64,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        # ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        # img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=train_pipeline,
    ),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
    ),
    # The test split reuses the val2017 annotations and the val pipeline.
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
    ),
)

fp16 = dict(loss_scale='dynamic')