# Poseur top-down COCO keypoint-detection configuration
# (Hugging Face mirror, commit 2de1f98, ~8.7 kB).
from config import cfg
# ---- Runtime, checkpointing and evaluation ----
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = {'backend': 'nccl'}
workflow = [('train', 1)]
checkpoint_config = {'interval': 10}
# Validate every 25 epochs with COCO-style mAP; RLE scoring enabled.
evaluation = {
    'interval': 25,
    'metric': 'mAP',
    'key_indicator': 'AP',
    'rle_score': True,
}

# ---- Optimizer ----
# AdamW, with a 10x smaller learning rate on the deformable-attention
# sampling offsets and reference points.
optimizer = {
    'type': 'AdamW',
    'lr': 1e-3,
    'weight_decay': 1e-4,
    'paramwise_cfg': {
        'custom_keys': {
            'sampling_offsets': {'lr_mult': 0.1},
            'reference_points': {'lr_mult': 0.1},
        },
    },
}
optimizer_config = {'grad_clip': None}

# ---- Learning-rate schedule ----
# Step decay at epochs 255 and 310 after a short linear warmup.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [255, 310],
}
total_epochs = 325

# ---- Logging ----
# Text + TensorBoard logging every 50 iterations.
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        {'type': 'TensorboardLoggerHook'},
    ],
}
# Keypoint channel layout.
# NOTE(review): num_output_channels/dataset_joints are 20, yet the channel
# lists enumerate only the 17 COCO body-keypoint indices — confirm the
# extra 3 channels are intentional for this head.
channel_cfg = {
    'num_output_channels': 20,
    'dataset_joints': 20,
    'dataset_channel': [list(range(17))],
    'inference_channel': list(range(17)),
}
# Transformer embedding width shared by the neck and the head.
emb_dim = 256

# Derive the neck input channel widths and the number of multi-scale
# feature levels from the configured upscale factor: each extra 2x of
# upscaling adds one pyramid level whose width halves relative to
# cfg.feat_dim.
if cfg.upscale == 1:
    neck_in_channels = [cfg.feat_dim]
    num_levels = 1
elif cfg.upscale == 2:
    neck_in_channels = [cfg.feat_dim // 2, cfg.feat_dim]
    num_levels = 2
elif cfg.upscale == 4:
    neck_in_channels = [cfg.feat_dim // 4, cfg.feat_dim // 2, cfg.feat_dim]
    num_levels = 3
elif cfg.upscale == 8:
    neck_in_channels = [
        cfg.feat_dim // 8, cfg.feat_dim // 4, cfg.feat_dim // 2, cfg.feat_dim
    ]
    num_levels = 4
else:
    # Fail fast: the original chain silently left neck_in_channels and
    # num_levels undefined for any other value, which would surface later
    # as an unrelated NameError when the model config is built.
    raise ValueError(
        f'unsupported cfg.upscale: {cfg.upscale!r} (expected 1, 2, 4 or 8)')
# ---- Model: Poseur top-down regression-based pose estimator ----
norm_cfg = {'type': 'BN', 'requires_grad': True}

model = {
    'type': 'Poseur',
    'pretrained': 'torchvision://resnet50',
    # ResNet-50 backbone returning all four stage outputs.
    'backbone': {
        'type': 'ResNet',
        'norm_cfg': norm_cfg,
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
    },
    # 1x1 conv projection of each selected level to the shared width.
    'neck': {
        'type': 'ChannelMapper',
        'in_channels': neck_in_channels,
        'kernel_size': 1,
        'out_channels': emb_dim,
        'act_cfg': None,
        'norm_cfg': {'type': 'GN', 'num_groups': 32},
    },
    'keypoint_head': {
        'type': 'Poseur_noise_sample',
        # NOTE(review): in_channels is 512 while the neck emits emb_dim
        # (256) channels — confirm the head really expects 512 here.
        'in_channels': 512,
        'num_queries': channel_cfg['num_output_channels'],
        'num_reg_fcs': 2,
        'num_joints': channel_cfg['num_output_channels'],
        'with_box_refine': True,
        # RLE (residual log-likelihood) coordinate losses for the encoder
        # and decoder outputs, plus an auxiliary heatmap MSE loss.
        'loss_coord_enc': {'type': 'RLELoss_poseur',
                           'use_target_weight': True},
        'loss_coord_dec': {'type': 'RLELoss_poseur',
                           'use_target_weight': True},
        'loss_hp_keypoint': {
            'type': 'JointsMSELoss',
            'use_target_weight': True,
            'loss_weight': 10,
        },
        'positional_encoding': {
            'type': 'SinePositionalEncoding',
            'num_feats': emb_dim // 2,
            'normalize': True,
            'offset': -0.5,
        },
        'transformer': {
            'type': 'PoseurTransformer_v3',
            'num_joints': channel_cfg['num_output_channels'],
            'query_pose_emb': True,
            'embed_dims': emb_dim,
            # Encoder is configured with zero layers (pass-through variant).
            'encoder': {
                'type': 'DetrTransformerEncoder_zero_layer',
                'num_layers': 0,
                'transformerlayers': {
                    'type': 'BaseTransformerLayer',
                    'ffn_cfgs': {'embed_dims': emb_dim},
                    'attn_cfgs': {
                        'type': 'MultiScaleDeformableAttention',
                        'num_levels': num_levels,
                        'num_points': 4,
                        'embed_dims': emb_dim,
                    },
                    'feedforward_channels': 1024,
                    'ffn_dropout': 0.1,
                    'operation_order': ('self_attn', 'norm', 'ffn', 'norm'),
                },
            },
            'decoder': {
                'type': 'DeformableDetrTransformerDecoder',
                'num_layers': 6,
                'return_intermediate': True,
                'transformerlayers': {
                    'type': 'DetrTransformerDecoderLayer_grouped',
                    'ffn_cfgs': {'embed_dims': emb_dim},
                    # Self-attention over queries, then deformable
                    # cross-attention into the image features.
                    'attn_cfgs': [
                        {
                            'type': 'MultiheadAttention',
                            'embed_dims': emb_dim,
                            'num_heads': 8,
                            'dropout': 0.1,
                        },
                        {
                            'type': 'MultiScaleDeformableAttention_post_value',
                            'num_levels': num_levels,
                            'num_points': 4,
                            'embed_dims': emb_dim,
                        },
                    ],
                    'feedforward_channels': 1024,
                    'num_joints': channel_cfg['num_output_channels'],
                    'ffn_dropout': 0.1,
                    'operation_order': ('self_attn', 'norm', 'cross_attn',
                                        'norm', 'ffn', 'norm'),
                },
            },
        },
        'as_two_stage': True,
        'use_heatmap_loss': False,
    },
    'train_cfg': {'image_size': [192, 256]},
    'test_cfg': {
        'image_size': [192, 256],
        'flip_test': True,
        'post_process': 'default',
        'shift_heatmap': True,
        'modulate_kernel': 11,
    },
}
# Shared data configuration for all dataset splits.
data_cfg = {
    'image_size': [192, 256],
    'heatmap_size': [48, 64],
    'num_output_channels': channel_cfg['num_output_channels'],
    'num_joints': channel_cfg['dataset_joints'],
    'dataset_channel': channel_cfg['dataset_channel'],
    'inference_channel': channel_cfg['inference_channel'],
    'soft_nms': False,
    'nms_thr': 1.0,
    'oks_thr': 0.9,
    'vis_thr': 0.2,
    'det_bbox_thr': 0.0,
    # Evaluate on externally detected person boxes rather than GT boxes.
    'use_gt_bbox': False,
    'bbox_file': ('data/coco/person_detection_results/'
                  'COCO_val2017_detections_AP_H_56_person.json'),
}
# Training pipeline: load, augment, normalise, encode targets, collect.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'TopDownGetBboxCenterScale', 'padding': 1.25},
    {'type': 'TopDownRandomFlip', 'flip_prob': 0.5},
    {
        'type': 'TopDownHalfBodyTransform',
        'num_joints_half_body': 8,
        'prob_half_body': 0.3,
    },
    {
        'type': 'TopDownGetRandomScaleRotation',
        'rot_factor': 40,
        'scale_factor': 0.5,
    },
    {'type': 'TopDownAffine'},
    {'type': 'ToTensor'},
    {
        'type': 'NormalizeTensor',
        # ImageNet channel statistics.
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225],
    },
    # Joint coordinate + MSRA-style heatmap targets in one step.
    {
        'type': 'TopDownGenerateCoordAndHeatMapTarget',
        'target_type': 'wo_mask',
        'encoding': 'MSRA',
        'sigma': 2,
    },
    {
        'type': 'Collect',
        'keys': [
            'img', 'coord_target', 'coord_target_weight',
            'hp_target', 'hp_target_weight',
        ],
        'meta_keys': [
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs',
        ],
    },
]
# Validation pipeline: deterministic preprocessing only (no augmentation,
# no target encoding — predictions are scored against COCO annotations).
val_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'TopDownGetBboxCenterScale', 'padding': 1.25},
    {'type': 'TopDownAffine'},
    {'type': 'ToTensor'},
    {
        'type': 'NormalizeTensor',
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225],
    },
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs',
        ],
    },
]

# Testing reuses the validation pipeline unchanged.
test_pipeline = val_pipeline
data_root = 'data/coco'

# Dataloaders: train on COCO train2017, validate/test on val2017.
data = {
    'samples_per_gpu': 32,
    'workers_per_gpu': 8,
    'val_dataloader': {'samples_per_gpu': 32},
    'test_dataloader': {'samples_per_gpu': 32},
    'train': {
        'type': 'TopDownCocoDataset',
        'ann_file': f'{data_root}/annotations/person_keypoints_train2017.json',
        'img_prefix': f'{data_root}/train2017/',
        'data_cfg': data_cfg,
        'pipeline': train_pipeline,
    },
    'val': {
        'type': 'TopDownCocoDataset',
        'ann_file': f'{data_root}/annotations/person_keypoints_val2017.json',
        'img_prefix': f'{data_root}/val2017/',
        'data_cfg': data_cfg,
        'pipeline': val_pipeline,
    },
    'test': {
        'type': 'TopDownCocoDataset',
        'ann_file': f'{data_root}/annotations/person_keypoints_val2017.json',
        'img_prefix': f'{data_root}/val2017/',
        'data_cfg': data_cfg,
        'pipeline': val_pipeline,
    },
}
# Mixed-precision training with dynamic loss scaling.
fp16 = {'loss_scale': 'dynamic'}