DeclK committed
Commit 6ed2820 · 1 Parent(s): da31fac
.gitignore ADDED
@@ -0,0 +1,7 @@
+ *.pth
+ *.pkl
+ *.mp4
+ *.onnx
+ *.ttf
+ tempt*
+ __pycache__
app.py ADDED
@@ -0,0 +1,109 @@
+ # Inference 2 videos and use dtw to match the pose keypoints.
+ from tools.inferencer import PoseInferencerV2
+ from tools.dtw import DTWForKeypoints
+ from tools.visualizer import FastVisualizer
+ from tools.utils import convert_video_to_playable_mp4
+ from argparse import ArgumentParser
+ from pathlib import Path
+ from tqdm import tqdm
+ import mmengine
+ import numpy as np
+ import mmcv
+ import cv2
+ import gradio as gr
+
+ def parse_args():
+     parser = ArgumentParser()
+     parser.add_argument('--config', type=str, default='configs/mark2.py')
+     parser.add_argument('--video1', type=str, default='assets/tennis1.mp4')
+     parser.add_argument('--video2', type=str, default='assets/tennis2.mp4')
+     return parser.parse_args()
+
+ def concat(img1, img2, height=1080):
+     # cv2 images are (height, width, channels)
+     h1, w1, _ = img1.shape
+     h2, w2, _ = img2.shape
+
+     # Scale both images to the same target height so they can be
+     # concatenated horizontally
+     scale1 = height / h1
+     scale2 = height / h2
+
+     # Resize the images (cv2.resize expects (width, height))
+     img1 = cv2.resize(img1, (int(w1 * scale1), int(h1 * scale1)))
+     img2 = cv2.resize(img2, (int(w2 * scale2), int(h2 * scale2)))
+
+     # Concatenate the images horizontally
+     image = cv2.hconcat([img1, img2])
+     return image
+
+ def draw(vis: FastVisualizer, img, keypoint, box, oks, oks_unnorm, draw_score_bar=True):
+     vis.set_image(img)
+     vis.draw_non_transparent_area(box)
+     if draw_score_bar:
+         vis.draw_score_bar(oks)
+     vis.draw_human_keypoints(keypoint, oks_unnorm)
+     return vis.get_image()
+
+ def main(video1, video2):
+     # build PoseInferencerV2
+     config = 'configs/mark2.py'
+     cfg = mmengine.Config.fromfile(config)
+     pose_inferencer = PoseInferencerV2(
+         cfg.det_cfg,
+         cfg.pose_cfg,
+         device='cpu')
+
+     v1 = mmcv.VideoReader(video1)
+     v2 = mmcv.VideoReader(video2)
+     video_writer = None
+
+     all_det1, all_pose1 = pose_inferencer.inference_video(video1)
+     all_det2, all_pose2 = pose_inferencer.inference_video(video2)
+
+     keypoints1 = np.stack([p.keypoints[0] for p in all_pose1])  # keep only the first predicted instance
+     keypoints2 = np.stack([p.keypoints[0] for p in all_pose2])
+     boxes1 = np.stack([d.bboxes[0] for d in all_det1])
+     boxes2 = np.stack([d.bboxes[0] for d in all_det2])
+
+     dtw_path, oks, oks_unnorm = DTWForKeypoints(keypoints1, keypoints2).get_dtw_path()
+
+     vis = FastVisualizer()
+
+     for i, j in tqdm(dtw_path):
+         frame1 = v1[i]
+         frame2 = v2[j]
+
+         frame1_ = draw(vis, frame1.copy(), keypoints1[i], boxes1[i],
+                        oks[i, j], oks_unnorm[i, j])
+         frame2_ = draw(vis, frame2.copy(), keypoints2[j], boxes2[j],
+                        oks[i, j], oks_unnorm[i, j], draw_score_bar=False)
+         # concatenate the two frames
+         frame = concat(frame1_, frame2_)
+         # draw logo
+         vis.set_image(frame)
+         frame = vis.draw_logo().get_image()
+         # write video
+         w, h = frame.shape[1], frame.shape[0]
+         if video_writer is None:
+             fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+             video_writer = cv2.VideoWriter('dtw_compare.mp4',
+                                            fourcc, v1.fps, (w, h))
+         video_writer.write(frame)
+     video_writer.release()
+     # output video file
+     convert_video_to_playable_mp4('dtw_compare.mp4')
+     output = str(Path('dtw_compare.mp4').resolve())
+     return output
+
+ if __name__ == '__main__':
+     config = 'configs/mark2.py'
+     cfg = mmengine.Config.fromfile(config)
+
+     inputs = [
+         gr.Video(label="Input video 1"),
+         gr.Video(label="Input video 2")
+     ]
+
+     output = gr.Video(label="Output video")
+
+     demo = gr.Interface(fn=main, inputs=inputs, outputs=output).queue()
+     demo.launch(share=True)
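A note on the data flow in main() above, inferred from how the loop indexes the arrays rather than from separate documentation: DTWForKeypoints.get_dtw_path() returns a frame-alignment path plus frame-by-frame similarity matrices, so oks[i, j] scores how well frame i of video 1 matches frame j of video 2. A toy sketch of those shapes (hypothetical values only):

import numpy as np

# Hypothetical similarity matrix: rows index frames of video 1,
# columns index frames of video 2 (the real one comes from DTWForKeypoints).
oks = np.array([[0.9, 0.2],
                [0.3, 0.8]])
dtw_path = [(0, 0), (1, 1)]   # matched frame-index pairs
for i, j in dtw_path:
    print(i, j, oks[i, j])    # the per-pair score drawn on the score bar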
assets/logo.png ADDED
assets/onnx_test.jpg ADDED
configs/mark1.py ADDED
@@ -0,0 +1,9 @@
+ det_cfg = dict(
+     model_cfg='model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco.py',
+     model_ckpt='/github/Tennis.ai/model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'
+ )
+
+ pose_cfg = dict(
+     model_cfg='model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-t_8xb256-420e_aic-coco-256x192.py',
+     model_ckpt='model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth'
+ )
configs/mark2.py ADDED
@@ -0,0 +1,10 @@
+ det_cfg = dict(
+     deploy_cfg='model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/detection_onnxruntime_static.py',
+     model_cfg='model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco.py',
+     backend_files=['model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/end2end.onnx']
+ )
+
+ pose_cfg = dict(
+     model_cfg='model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-t_8xb256-420e_aic-coco-256x192.py',
+     model_ckpt='model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth'
+ )
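The two configs differ only in how the detector is supplied: mark1.py points PoseInferencerV2 at a PyTorch checkpoint, while mark2.py points it at the ONNX model exported by convert_det.sh plus its mmdeploy config. A minimal sketch of how app.py and main.py consume either file (the paths and the 'cpu' device are taken from the code above; this is illustrative, not a documented API):

import mmengine
from tools.inferencer import PoseInferencerV2

cfg = mmengine.Config.fromfile('configs/mark2.py')   # or configs/mark1.py
inferencer = PoseInferencerV2(cfg.det_cfg, cfg.pose_cfg, device='cpu')
all_det, all_pose = inferencer.inference_video('assets/tennis1.mp4')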
convert_det.sh ADDED
@@ -0,0 +1,8 @@
+ python tools/deploy.py \
+     model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/detection_onnxruntime_static.py \
+     model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco.py \
+     model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth \
+     assets/onnx_test.jpg \
+     --work-dir model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco \
+     --device cpu \
+     --show
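The script only wraps tools/deploy.py; the end2end.onnx it writes is the file configs/mark2.py expects under backend_files. A quick, optional sanity check that the export loads (a sketch assuming onnxruntime from requirements.txt; the expected tensor names come from detection_onnxruntime_static.py below):

import onnxruntime as ort

sess = ort.InferenceSession(
    'model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/end2end.onnx',
    providers=['CPUExecutionProvider'])
print([i.name for i in sess.get_inputs()])    # expect ['input']
print([o.name for o in sess.get_outputs()])   # expect ['dets', 'labels']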
main.py ADDED
@@ -0,0 +1,96 @@
+ # Inference 2 videos and use dtw to match the pose keypoints.
+ from tools.inferencer import PoseInferencerV2
+ from tools.dtw import DTWForKeypoints
+ from tools.visualizer import FastVisualizer
+ from argparse import ArgumentParser
+ from tools.utils import convert_video_to_playable_mp4
+ from tqdm import tqdm
+ import mmengine
+ import numpy as np
+ import mmcv
+ import cv2
+
+ def parse_args():
+     parser = ArgumentParser()
+     parser.add_argument('--config', type=str, default='configs/mark2.py')
+     parser.add_argument('--video1', type=str, default='assets/tennis1.mp4')
+     parser.add_argument('--video2', type=str, default='assets/tennis2.mp4')
+     return parser.parse_args()
+
+ def concat(img1, img2, height=1080):
+     # cv2 images are (height, width, channels)
+     h1, w1, _ = img1.shape
+     h2, w2, _ = img2.shape
+
+     # Scale both images to the same target height so they can be
+     # concatenated horizontally
+     scale1 = height / h1
+     scale2 = height / h2
+
+     # Resize the images (cv2.resize expects (width, height))
+     img1 = cv2.resize(img1, (int(w1 * scale1), int(h1 * scale1)))
+     img2 = cv2.resize(img2, (int(w2 * scale2), int(h2 * scale2)))
+
+     # Concatenate the images horizontally
+     image = cv2.hconcat([img1, img2])
+     return image
+
+ def draw(vis: FastVisualizer, img, keypoint, box, oks, oks_unnorm, draw_score_bar=True):
+     vis.set_image(img)
+     vis.draw_non_transparent_area(box)
+     if draw_score_bar:
+         vis.draw_score_bar(oks)
+     vis.draw_human_keypoints(keypoint, oks_unnorm)
+     return vis.get_image()
+
+ def main(cfg):
+     # build PoseInferencerV2
+     pose_inferencer = PoseInferencerV2(
+         cfg.det_cfg,
+         cfg.pose_cfg,
+         device='cpu')
+
+     v1 = mmcv.VideoReader(cfg.video1)
+     v2 = mmcv.VideoReader(cfg.video2)
+     video_writer = None
+
+     all_det1, all_pose1 = pose_inferencer.inference_video(cfg.video1)
+     all_det2, all_pose2 = pose_inferencer.inference_video(cfg.video2)
+
+     keypoints1 = np.stack([p.keypoints[0] for p in all_pose1])  # keep only the first predicted instance
+     keypoints2 = np.stack([p.keypoints[0] for p in all_pose2])
+     boxes1 = np.stack([d.bboxes[0] for d in all_det1])
+     boxes2 = np.stack([d.bboxes[0] for d in all_det2])
+
+     dtw_path, oks, oks_unnorm = DTWForKeypoints(keypoints1, keypoints2).get_dtw_path()
+
+     vis = FastVisualizer()
+
+     for i, j in tqdm(dtw_path):
+         frame1 = v1[i]
+         frame2 = v2[j]
+
+         frame1_ = draw(vis, frame1.copy(), keypoints1[i], boxes1[i],
+                        oks[i, j], oks_unnorm[i, j])
+         frame2_ = draw(vis, frame2.copy(), keypoints2[j], boxes2[j],
+                        oks[i, j], oks_unnorm[i, j], draw_score_bar=False)
+         # concatenate the two frames
+         frame = concat(frame1_, frame2_)
+         # draw logo
+         vis.set_image(frame)
+         frame = vis.draw_logo().get_image()
+         # write video
+         w, h = frame.shape[1], frame.shape[0]
+         if video_writer is None:
+             fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+             video_writer = cv2.VideoWriter('dtw_compare.mp4',
+                                            fourcc, v1.fps, (w, h))
+         video_writer.write(frame)
+     video_writer.release()
+     convert_video_to_playable_mp4('dtw_compare.mp4')
+
+ if __name__ == '__main__':
+     args = parse_args()
+     cfg = mmengine.Config.fromfile(args.config)
+     cfg.video1 = args.video1
+     cfg.video2 = args.video2
+
+     main(cfg)
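main.py is the CLI counterpart of app.py: the same inference and visualization, but driven by argparse instead of Gradio. A programmatic equivalent of its entry point (the paths are just the parse_args defaults shown above):

import mmengine
from main import main

cfg = mmengine.Config.fromfile('configs/mark2.py')
cfg.video1 = 'assets/tennis1.mp4'
cfg.video2 = 'assets/tennis2.mp4'
main(cfg)   # writes dtw_compare.mp4 next to the script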
model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/detection_onnxruntime_static.py ADDED
@@ -0,0 +1,23 @@
+ onnx_config = dict(
+     type='onnx',
+     export_params=True,
+     keep_initializers_as_inputs=False,
+     opset_version=11,
+     save_file='end2end.onnx',
+     input_names=['input'],
+     output_names=['dets', 'labels'],
+     input_shape=None,
+     optimize=True)
+ codebase_config = dict(
+     type='mmdet',
+     task='ObjectDetection',
+     model_type='end2end',
+     post_processing=dict(
+         score_threshold=0.05,
+         confidence_threshold=0.005,
+         iou_threshold=0.5,
+         max_output_boxes_per_class=200,
+         pre_top_k=5000,
+         keep_top_k=100,
+         background_label_id=-1))
+ backend_config = dict(type='onnxruntime')
model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco.py ADDED
@@ -0,0 +1,345 @@
1
+ default_scope = 'mmdet'
2
+ default_hooks = dict(
3
+ timer=dict(type='IterTimerHook'),
4
+ logger=dict(type='LoggerHook', interval=50),
5
+ param_scheduler=dict(type='ParamSchedulerHook'),
6
+ checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3),
7
+ sampler_seed=dict(type='DistSamplerSeedHook'),
8
+ visualization=dict(type='DetVisualizationHook'))
9
+ env_cfg = dict(
10
+ cudnn_benchmark=False,
11
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
12
+ dist_cfg=dict(backend='nccl'))
13
+ vis_backends = [dict(type='LocalVisBackend')]
14
+ visualizer = dict(
15
+ type='DetLocalVisualizer',
16
+ vis_backends=[dict(type='LocalVisBackend')],
17
+ name='visualizer')
18
+ log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
19
+ log_level = 'INFO'
20
+ load_from = None
21
+ resume = False
22
+ train_cfg = dict(
23
+ type='EpochBasedTrainLoop',
24
+ max_epochs=300,
25
+ val_interval=10,
26
+ dynamic_intervals=[(280, 1)])
27
+ val_cfg = dict(type='ValLoop')
28
+ test_cfg = dict(type='TestLoop')
29
+ param_scheduler = [
30
+ dict(
31
+ type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0,
32
+ end=1000),
33
+ dict(
34
+ type='CosineAnnealingLR',
35
+ eta_min=0.0002,
36
+ begin=150,
37
+ end=300,
38
+ T_max=150,
39
+ by_epoch=True,
40
+ convert_to_iter_based=True)
41
+ ]
42
+ optim_wrapper = dict(
43
+ type='OptimWrapper',
44
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
45
+ paramwise_cfg=dict(
46
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
47
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
48
+ dataset_type = 'CocoDataset'
49
+ data_root = 'data/coco/'
50
+ backend_args = None
51
+ train_pipeline = [
52
+ dict(type='LoadImageFromFile', backend_args=None),
53
+ dict(type='LoadAnnotations', with_bbox=True),
54
+ dict(
55
+ type='CachedMosaic',
56
+ img_scale=(640, 640),
57
+ pad_val=114.0,
58
+ max_cached_images=20,
59
+ random_pop=False),
60
+ dict(
61
+ type='RandomResize',
62
+ scale=(1280, 1280),
63
+ ratio_range=(0.5, 2.0),
64
+ keep_ratio=True),
65
+ dict(type='RandomCrop', crop_size=(640, 640)),
66
+ dict(type='YOLOXHSVRandomAug'),
67
+ dict(type='RandomFlip', prob=0.5),
68
+ dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
69
+ dict(
70
+ type='CachedMixUp',
71
+ img_scale=(640, 640),
72
+ ratio_range=(1.0, 1.0),
73
+ max_cached_images=10,
74
+ random_pop=False,
75
+ pad_val=(114, 114, 114),
76
+ prob=0.5),
77
+ dict(type='PackDetInputs')
78
+ ]
79
+ test_pipeline = [
80
+ dict(type='LoadImageFromFile', backend_args=None),
81
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
82
+ dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
83
+ dict(
84
+ type='PackDetInputs',
85
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
86
+ 'scale_factor'))
87
+ ]
88
+ train_dataloader = dict(
89
+ batch_size=32,
90
+ num_workers=10,
91
+ persistent_workers=True,
92
+ sampler=dict(type='DefaultSampler', shuffle=True),
93
+ batch_sampler=None,
94
+ dataset=dict(
95
+ type='CocoDataset',
96
+ data_root='data/coco/',
97
+ ann_file='annotations/instances_train2017.json',
98
+ data_prefix=dict(img='train2017/'),
99
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
100
+ pipeline=[
101
+ dict(type='LoadImageFromFile', backend_args=None),
102
+ dict(type='LoadAnnotations', with_bbox=True),
103
+ dict(
104
+ type='CachedMosaic',
105
+ img_scale=(640, 640),
106
+ pad_val=114.0,
107
+ max_cached_images=20,
108
+ random_pop=False),
109
+ dict(
110
+ type='RandomResize',
111
+ scale=(1280, 1280),
112
+ ratio_range=(0.5, 2.0),
113
+ keep_ratio=True),
114
+ dict(type='RandomCrop', crop_size=(640, 640)),
115
+ dict(type='YOLOXHSVRandomAug'),
116
+ dict(type='RandomFlip', prob=0.5),
117
+ dict(
118
+ type='Pad', size=(640, 640),
119
+ pad_val=dict(img=(114, 114, 114))),
120
+ dict(
121
+ type='CachedMixUp',
122
+ img_scale=(640, 640),
123
+ ratio_range=(1.0, 1.0),
124
+ max_cached_images=10,
125
+ random_pop=False,
126
+ pad_val=(114, 114, 114),
127
+ prob=0.5),
128
+ dict(type='PackDetInputs')
129
+ ],
130
+ backend_args=None),
131
+ pin_memory=True)
132
+ val_dataloader = dict(
133
+ batch_size=5,
134
+ num_workers=10,
135
+ persistent_workers=True,
136
+ drop_last=False,
137
+ sampler=dict(type='DefaultSampler', shuffle=False),
138
+ dataset=dict(
139
+ type='CocoDataset',
140
+ data_root='data/coco/',
141
+ ann_file='annotations/instances_val2017.json',
142
+ data_prefix=dict(img='val2017/'),
143
+ test_mode=True,
144
+ pipeline=[
145
+ dict(type='LoadImageFromFile', backend_args=None),
146
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
147
+ dict(
148
+ type='Pad', size=(640, 640),
149
+ pad_val=dict(img=(114, 114, 114))),
150
+ dict(
151
+ type='PackDetInputs',
152
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
153
+ 'scale_factor'))
154
+ ],
155
+ backend_args=None))
156
+ test_dataloader = dict(
157
+ batch_size=5,
158
+ num_workers=10,
159
+ persistent_workers=True,
160
+ drop_last=False,
161
+ sampler=dict(type='DefaultSampler', shuffle=False),
162
+ dataset=dict(
163
+ type='CocoDataset',
164
+ data_root='data/coco/',
165
+ ann_file='annotations/instances_val2017.json',
166
+ data_prefix=dict(img='val2017/'),
167
+ test_mode=True,
168
+ pipeline=[
169
+ dict(type='LoadImageFromFile', backend_args=None),
170
+ dict(type='Resize', scale=(640, 640), keep_ratio=True),
171
+ dict(
172
+ type='Pad', size=(640, 640),
173
+ pad_val=dict(img=(114, 114, 114))),
174
+ dict(
175
+ type='PackDetInputs',
176
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
177
+ 'scale_factor'))
178
+ ],
179
+ backend_args=None))
180
+ val_evaluator = dict(
181
+ type='CocoMetric',
182
+ ann_file='data/coco/annotations/instances_val2017.json',
183
+ metric='bbox',
184
+ format_only=False,
185
+ backend_args=None,
186
+ proposal_nums=(100, 1, 10))
187
+ test_evaluator = dict(
188
+ type='CocoMetric',
189
+ ann_file='data/coco/annotations/instances_val2017.json',
190
+ metric='bbox',
191
+ format_only=False,
192
+ backend_args=None,
193
+ proposal_nums=(100, 1, 10))
194
+ tta_model = dict(
195
+ type='DetTTAModel',
196
+ tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100))
197
+ img_scales = [(640, 640), (320, 320), (960, 960)]
198
+ tta_pipeline = [
199
+ dict(type='LoadImageFromFile', backend_args=None),
200
+ dict(
201
+ type='TestTimeAug',
202
+ transforms=[[{
203
+ 'type': 'Resize',
204
+ 'scale': (640, 640),
205
+ 'keep_ratio': True
206
+ }, {
207
+ 'type': 'Resize',
208
+ 'scale': (320, 320),
209
+ 'keep_ratio': True
210
+ }, {
211
+ 'type': 'Resize',
212
+ 'scale': (960, 960),
213
+ 'keep_ratio': True
214
+ }],
215
+ [{
216
+ 'type': 'RandomFlip',
217
+ 'prob': 1.0
218
+ }, {
219
+ 'type': 'RandomFlip',
220
+ 'prob': 0.0
221
+ }],
222
+ [{
223
+ 'type': 'Pad',
224
+ 'size': (960, 960),
225
+ 'pad_val': {
226
+ 'img': (114, 114, 114)
227
+ }
228
+ }],
229
+ [{
230
+ 'type':
231
+ 'PackDetInputs',
232
+ 'meta_keys':
233
+ ('img_id', 'img_path', 'ori_shape', 'img_shape',
234
+ 'scale_factor', 'flip', 'flip_direction')
235
+ }]])
236
+ ]
237
+ model = dict(
238
+ type='RTMDet',
239
+ data_preprocessor=dict(
240
+ type='DetDataPreprocessor',
241
+ mean=[103.53, 116.28, 123.675],
242
+ std=[57.375, 57.12, 58.395],
243
+ bgr_to_rgb=False,
244
+ batch_augments=None),
245
+ backbone=dict(
246
+ type='CSPNeXt',
247
+ arch='P5',
248
+ expand_ratio=0.5,
249
+ deepen_factor=0.167,
250
+ widen_factor=0.375,
251
+ channel_attention=True,
252
+ norm_cfg=dict(type='SyncBN'),
253
+ act_cfg=dict(type='SiLU', inplace=True),
254
+ init_cfg=dict(
255
+ type='Pretrained',
256
+ prefix='backbone.',
257
+ checkpoint=
258
+ 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth'
259
+ )),
260
+ neck=dict(
261
+ type='CSPNeXtPAFPN',
262
+ in_channels=[96, 192, 384],
263
+ out_channels=96,
264
+ num_csp_blocks=1,
265
+ expand_ratio=0.5,
266
+ norm_cfg=dict(type='SyncBN'),
267
+ act_cfg=dict(type='SiLU', inplace=True)),
268
+ bbox_head=dict(
269
+ type='RTMDetSepBNHead',
270
+ num_classes=80,
271
+ in_channels=96,
272
+ stacked_convs=2,
273
+ feat_channels=96,
274
+ anchor_generator=dict(
275
+ type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
276
+ bbox_coder=dict(type='DistancePointBBoxCoder'),
277
+ loss_cls=dict(
278
+ type='QualityFocalLoss',
279
+ use_sigmoid=True,
280
+ beta=2.0,
281
+ loss_weight=1.0),
282
+ loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
283
+ with_objectness=False,
284
+ exp_on_reg=False,
285
+ share_conv=True,
286
+ pred_kernel_size=1,
287
+ norm_cfg=dict(type='SyncBN'),
288
+ act_cfg=dict(type='SiLU', inplace=True)),
289
+ train_cfg=dict(
290
+ assigner=dict(type='DynamicSoftLabelAssigner', topk=13),
291
+ allowed_border=-1,
292
+ pos_weight=-1,
293
+ debug=False),
294
+ test_cfg=dict(
295
+ nms_pre=30000,
296
+ min_bbox_size=0,
297
+ score_thr=0.001,
298
+ nms=dict(type='nms', iou_threshold=0.65),
299
+ max_per_img=300))
300
+ train_pipeline_stage2 = [
301
+ dict(type='LoadImageFromFile', backend_args=None),
302
+ dict(type='LoadAnnotations', with_bbox=True),
303
+ dict(
304
+ type='RandomResize',
305
+ scale=(640, 640),
306
+ ratio_range=(0.5, 2.0),
307
+ keep_ratio=True),
308
+ dict(type='RandomCrop', crop_size=(640, 640)),
309
+ dict(type='YOLOXHSVRandomAug'),
310
+ dict(type='RandomFlip', prob=0.5),
311
+ dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
312
+ dict(type='PackDetInputs')
313
+ ]
314
+ max_epochs = 300
315
+ stage2_num_epochs = 20
316
+ base_lr = 0.004
317
+ interval = 10
318
+ custom_hooks = [
319
+ dict(
320
+ type='EMAHook',
321
+ ema_type='ExpMomentumEMA',
322
+ momentum=0.0002,
323
+ update_buffers=True,
324
+ priority=49),
325
+ dict(
326
+ type='PipelineSwitchHook',
327
+ switch_epoch=280,
328
+ switch_pipeline=[
329
+ dict(type='LoadImageFromFile', backend_args=None),
330
+ dict(type='LoadAnnotations', with_bbox=True),
331
+ dict(
332
+ type='RandomResize',
333
+ scale=(640, 640),
334
+ ratio_range=(0.5, 2.0),
335
+ keep_ratio=True),
336
+ dict(type='RandomCrop', crop_size=(640, 640)),
337
+ dict(type='YOLOXHSVRandomAug'),
338
+ dict(type='RandomFlip', prob=0.5),
339
+ dict(
340
+ type='Pad', size=(640, 640),
341
+ pad_val=dict(img=(114, 114, 114))),
342
+ dict(type='PackDetInputs')
343
+ ])
344
+ ]
345
+ checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth'
model_zoo/rtmpose/rtmpose-m_8xb256-420e_aic-coco-256x192/rtmpose-m_8xb256-420e_aic-coco-256x192.py ADDED
@@ -0,0 +1,391 @@
1
+ default_scope = 'mmpose'
2
+ default_hooks = dict(
3
+ timer=dict(type='IterTimerHook'),
4
+ logger=dict(type='LoggerHook', interval=50),
5
+ param_scheduler=dict(type='ParamSchedulerHook'),
6
+ checkpoint=dict(
7
+ type='CheckpointHook',
8
+ interval=10,
9
+ save_best='coco/AP',
10
+ rule='greater',
11
+ max_keep_ckpts=1),
12
+ sampler_seed=dict(type='DistSamplerSeedHook'),
13
+ visualization=dict(type='PoseVisualizationHook', enable=False))
14
+ custom_hooks = [
15
+ dict(
16
+ type='EMAHook',
17
+ ema_type='ExpMomentumEMA',
18
+ momentum=0.0002,
19
+ update_buffers=True,
20
+ priority=49),
21
+ dict(
22
+ type='mmdet.PipelineSwitchHook',
23
+ switch_epoch=390,
24
+ switch_pipeline=[
25
+ dict(type='LoadImage', backend_args=dict(backend='local')),
26
+ dict(type='GetBBoxCenterScale'),
27
+ dict(type='RandomFlip', direction='horizontal'),
28
+ dict(type='RandomHalfBody'),
29
+ dict(
30
+ type='RandomBBoxTransform',
31
+ shift_factor=0.0,
32
+ scale_factor=[0.75, 1.25],
33
+ rotate_factor=60),
34
+ dict(type='TopdownAffine', input_size=(192, 256)),
35
+ dict(type='mmdet.YOLOXHSVRandomAug'),
36
+ dict(
37
+ type='Albumentation',
38
+ transforms=[
39
+ dict(type='Blur', p=0.1),
40
+ dict(type='MedianBlur', p=0.1),
41
+ dict(
42
+ type='CoarseDropout',
43
+ max_holes=1,
44
+ max_height=0.4,
45
+ max_width=0.4,
46
+ min_holes=1,
47
+ min_height=0.2,
48
+ min_width=0.2,
49
+ p=0.5)
50
+ ]),
51
+ dict(
52
+ type='GenerateTarget',
53
+ encoder=dict(
54
+ type='SimCCLabel',
55
+ input_size=(192, 256),
56
+ sigma=(4.9, 5.66),
57
+ simcc_split_ratio=2.0,
58
+ normalize=False,
59
+ use_dark=False)),
60
+ dict(type='PackPoseInputs')
61
+ ])
62
+ ]
63
+ env_cfg = dict(
64
+ cudnn_benchmark=False,
65
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
66
+ dist_cfg=dict(backend='nccl'))
67
+ vis_backends = [dict(type='LocalVisBackend')]
68
+ visualizer = dict(
69
+ type='PoseLocalVisualizer',
70
+ vis_backends=[dict(type='LocalVisBackend')],
71
+ name='visualizer')
72
+ log_processor = dict(
73
+ type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
74
+ log_level = 'INFO'
75
+ load_from = None
76
+ resume = False
77
+ backend_args = dict(backend='local')
78
+ train_cfg = dict(by_epoch=True, max_epochs=420, val_interval=10)
79
+ val_cfg = dict()
80
+ test_cfg = dict()
81
+ max_epochs = 420
82
+ stage2_num_epochs = 30
83
+ base_lr = 0.004
84
+ randomness = dict(seed=21)
85
+ optim_wrapper = dict(
86
+ type='OptimWrapper',
87
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
88
+ paramwise_cfg=dict(
89
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
90
+ param_scheduler = [
91
+ dict(
92
+ type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0,
93
+ end=1000),
94
+ dict(
95
+ type='CosineAnnealingLR',
96
+ eta_min=0.0002,
97
+ begin=210,
98
+ end=420,
99
+ T_max=210,
100
+ by_epoch=True,
101
+ convert_to_iter_based=True)
102
+ ]
103
+ auto_scale_lr = dict(base_batch_size=1024)
104
+ codec = dict(
105
+ type='SimCCLabel',
106
+ input_size=(192, 256),
107
+ sigma=(4.9, 5.66),
108
+ simcc_split_ratio=2.0,
109
+ normalize=False,
110
+ use_dark=False)
111
+ model = dict(
112
+ type='TopdownPoseEstimator',
113
+ data_preprocessor=dict(
114
+ type='PoseDataPreprocessor',
115
+ mean=[123.675, 116.28, 103.53],
116
+ std=[58.395, 57.12, 57.375],
117
+ bgr_to_rgb=True),
118
+ backbone=dict(
119
+ _scope_='mmdet',
120
+ type='CSPNeXt',
121
+ arch='P5',
122
+ expand_ratio=0.5,
123
+ deepen_factor=0.67,
124
+ widen_factor=0.75,
125
+ out_indices=(4, ),
126
+ channel_attention=True,
127
+ norm_cfg=dict(type='SyncBN'),
128
+ act_cfg=dict(type='SiLU'),
129
+ init_cfg=dict(
130
+ type='Pretrained',
131
+ prefix='backbone.',
132
+ checkpoint=
133
+ 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth'
134
+ )),
135
+ head=dict(
136
+ type='RTMCCHead',
137
+ in_channels=768,
138
+ out_channels=17,
139
+ input_size=(192, 256),
140
+ in_featuremap_size=(6, 8),
141
+ simcc_split_ratio=2.0,
142
+ final_layer_kernel_size=7,
143
+ gau_cfg=dict(
144
+ hidden_dims=256,
145
+ s=128,
146
+ expansion_factor=2,
147
+ dropout_rate=0.0,
148
+ drop_path=0.0,
149
+ act_fn='SiLU',
150
+ use_rel_bias=False,
151
+ pos_enc=False),
152
+ loss=dict(
153
+ type='KLDiscretLoss',
154
+ use_target_weight=True,
155
+ beta=10.0,
156
+ label_softmax=True),
157
+ decoder=dict(
158
+ type='SimCCLabel',
159
+ input_size=(192, 256),
160
+ sigma=(4.9, 5.66),
161
+ simcc_split_ratio=2.0,
162
+ normalize=False,
163
+ use_dark=False)),
164
+ test_cfg=dict(flip_test=True))
165
+ dataset_type = 'CocoDataset'
166
+ data_mode = 'topdown'
167
+ data_root = 'data/'
168
+ train_pipeline = [
169
+ dict(type='LoadImage', backend_args=dict(backend='local')),
170
+ dict(type='GetBBoxCenterScale'),
171
+ dict(type='RandomFlip', direction='horizontal'),
172
+ dict(type='RandomHalfBody'),
173
+ dict(
174
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
175
+ dict(type='TopdownAffine', input_size=(192, 256)),
176
+ dict(type='mmdet.YOLOXHSVRandomAug'),
177
+ dict(
178
+ type='Albumentation',
179
+ transforms=[
180
+ dict(type='Blur', p=0.1),
181
+ dict(type='MedianBlur', p=0.1),
182
+ dict(
183
+ type='CoarseDropout',
184
+ max_holes=1,
185
+ max_height=0.4,
186
+ max_width=0.4,
187
+ min_holes=1,
188
+ min_height=0.2,
189
+ min_width=0.2,
190
+ p=1.0)
191
+ ]),
192
+ dict(
193
+ type='GenerateTarget',
194
+ encoder=dict(
195
+ type='SimCCLabel',
196
+ input_size=(192, 256),
197
+ sigma=(4.9, 5.66),
198
+ simcc_split_ratio=2.0,
199
+ normalize=False,
200
+ use_dark=False)),
201
+ dict(type='PackPoseInputs')
202
+ ]
203
+ val_pipeline = [
204
+ dict(type='LoadImage', backend_args=dict(backend='local')),
205
+ dict(type='GetBBoxCenterScale'),
206
+ dict(type='TopdownAffine', input_size=(192, 256)),
207
+ dict(type='PackPoseInputs')
208
+ ]
209
+ train_pipeline_stage2 = [
210
+ dict(type='LoadImage', backend_args=dict(backend='local')),
211
+ dict(type='GetBBoxCenterScale'),
212
+ dict(type='RandomFlip', direction='horizontal'),
213
+ dict(type='RandomHalfBody'),
214
+ dict(
215
+ type='RandomBBoxTransform',
216
+ shift_factor=0.0,
217
+ scale_factor=[0.75, 1.25],
218
+ rotate_factor=60),
219
+ dict(type='TopdownAffine', input_size=(192, 256)),
220
+ dict(type='mmdet.YOLOXHSVRandomAug'),
221
+ dict(
222
+ type='Albumentation',
223
+ transforms=[
224
+ dict(type='Blur', p=0.1),
225
+ dict(type='MedianBlur', p=0.1),
226
+ dict(
227
+ type='CoarseDropout',
228
+ max_holes=1,
229
+ max_height=0.4,
230
+ max_width=0.4,
231
+ min_holes=1,
232
+ min_height=0.2,
233
+ min_width=0.2,
234
+ p=0.5)
235
+ ]),
236
+ dict(
237
+ type='GenerateTarget',
238
+ encoder=dict(
239
+ type='SimCCLabel',
240
+ input_size=(192, 256),
241
+ sigma=(4.9, 5.66),
242
+ simcc_split_ratio=2.0,
243
+ normalize=False,
244
+ use_dark=False)),
245
+ dict(type='PackPoseInputs')
246
+ ]
247
+ dataset_coco = dict(
248
+ type='RepeatDataset',
249
+ dataset=dict(
250
+ type='CocoDataset',
251
+ data_root='data/',
252
+ data_mode='topdown',
253
+ ann_file='coco/annotations/person_keypoints_train2017.json',
254
+ data_prefix=dict(img='detection/coco/train2017/'),
255
+ pipeline=[]),
256
+ times=3)
257
+ dataset_aic = dict(
258
+ type='AicDataset',
259
+ data_root='data/',
260
+ data_mode='topdown',
261
+ ann_file='aic/annotations/aic_train.json',
262
+ data_prefix=dict(
263
+ img=
264
+ 'pose/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/'
265
+ ),
266
+ pipeline=[
267
+ dict(
268
+ type='KeypointConverter',
269
+ num_keypoints=17,
270
+ mapping=[(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
271
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)])
272
+ ])
273
+ train_dataloader = dict(
274
+ batch_size=256,
275
+ num_workers=10,
276
+ persistent_workers=True,
277
+ sampler=dict(type='DefaultSampler', shuffle=True),
278
+ dataset=dict(
279
+ type='CombinedDataset',
280
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
281
+ datasets=[
282
+ dict(
283
+ type='RepeatDataset',
284
+ dataset=dict(
285
+ type='CocoDataset',
286
+ data_root='data/',
287
+ data_mode='topdown',
288
+ ann_file='coco/annotations/person_keypoints_train2017.json',
289
+ data_prefix=dict(img='detection/coco/train2017/'),
290
+ pipeline=[]),
291
+ times=3),
292
+ dict(
293
+ type='AicDataset',
294
+ data_root='data/',
295
+ data_mode='topdown',
296
+ ann_file='aic/annotations/aic_train.json',
297
+ data_prefix=dict(
298
+ img=
299
+ 'pose/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/'
300
+ ),
301
+ pipeline=[
302
+ dict(
303
+ type='KeypointConverter',
304
+ num_keypoints=17,
305
+ mapping=[(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
306
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11),
307
+ (10, 13), (11, 15)])
308
+ ])
309
+ ],
310
+ pipeline=[
311
+ dict(type='LoadImage', backend_args=dict(backend='local')),
312
+ dict(type='GetBBoxCenterScale'),
313
+ dict(type='RandomFlip', direction='horizontal'),
314
+ dict(type='RandomHalfBody'),
315
+ dict(
316
+ type='RandomBBoxTransform',
317
+ scale_factor=[0.6, 1.4],
318
+ rotate_factor=80),
319
+ dict(type='TopdownAffine', input_size=(192, 256)),
320
+ dict(type='mmdet.YOLOXHSVRandomAug'),
321
+ dict(
322
+ type='Albumentation',
323
+ transforms=[
324
+ dict(type='Blur', p=0.1),
325
+ dict(type='MedianBlur', p=0.1),
326
+ dict(
327
+ type='CoarseDropout',
328
+ max_holes=1,
329
+ max_height=0.4,
330
+ max_width=0.4,
331
+ min_holes=1,
332
+ min_height=0.2,
333
+ min_width=0.2,
334
+ p=1.0)
335
+ ]),
336
+ dict(
337
+ type='GenerateTarget',
338
+ encoder=dict(
339
+ type='SimCCLabel',
340
+ input_size=(192, 256),
341
+ sigma=(4.9, 5.66),
342
+ simcc_split_ratio=2.0,
343
+ normalize=False,
344
+ use_dark=False)),
345
+ dict(type='PackPoseInputs')
346
+ ],
347
+ test_mode=False))
348
+ val_dataloader = dict(
349
+ batch_size=64,
350
+ num_workers=10,
351
+ persistent_workers=True,
352
+ drop_last=False,
353
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
354
+ dataset=dict(
355
+ type='CocoDataset',
356
+ data_root='data/',
357
+ data_mode='topdown',
358
+ ann_file='coco/annotations/person_keypoints_val2017.json',
359
+ data_prefix=dict(img='detection/coco/val2017/'),
360
+ test_mode=True,
361
+ pipeline=[
362
+ dict(type='LoadImage', backend_args=dict(backend='local')),
363
+ dict(type='GetBBoxCenterScale'),
364
+ dict(type='TopdownAffine', input_size=(192, 256)),
365
+ dict(type='PackPoseInputs')
366
+ ]))
367
+ test_dataloader = dict(
368
+ batch_size=64,
369
+ num_workers=10,
370
+ persistent_workers=True,
371
+ drop_last=False,
372
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
373
+ dataset=dict(
374
+ type='CocoDataset',
375
+ data_root='data/',
376
+ data_mode='topdown',
377
+ ann_file='coco/annotations/person_keypoints_val2017.json',
378
+ data_prefix=dict(img='detection/coco/val2017/'),
379
+ test_mode=True,
380
+ pipeline=[
381
+ dict(type='LoadImage', backend_args=dict(backend='local')),
382
+ dict(type='GetBBoxCenterScale'),
383
+ dict(type='TopdownAffine', input_size=(192, 256)),
384
+ dict(type='PackPoseInputs')
385
+ ]))
386
+ val_evaluator = dict(
387
+ type='CocoMetric',
388
+ ann_file='data/coco/annotations/person_keypoints_val2017.json')
389
+ test_evaluator = dict(
390
+ type='CocoMetric',
391
+ ann_file='data/coco/annotations/person_keypoints_val2017.json')
model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-t_8xb256-420e_aic-coco-256x192.py ADDED
@@ -0,0 +1,385 @@
1
+ default_scope = 'mmpose'
2
+ default_hooks = dict(
3
+ timer=dict(type='IterTimerHook'),
4
+ logger=dict(type='LoggerHook', interval=50),
5
+ param_scheduler=dict(type='ParamSchedulerHook'),
6
+ checkpoint=dict(
7
+ type='CheckpointHook',
8
+ interval=10,
9
+ save_best='coco/AP',
10
+ rule='greater',
11
+ max_keep_ckpts=1),
12
+ sampler_seed=dict(type='DistSamplerSeedHook'),
13
+ visualization=dict(type='PoseVisualizationHook', enable=False))
14
+ custom_hooks = [
15
+ dict(
16
+ type='mmdet.PipelineSwitchHook',
17
+ switch_epoch=390,
18
+ switch_pipeline=[
19
+ dict(type='LoadImage', backend_args=dict(backend='local')),
20
+ dict(type='GetBBoxCenterScale'),
21
+ dict(type='RandomFlip', direction='horizontal'),
22
+ dict(type='RandomHalfBody'),
23
+ dict(
24
+ type='RandomBBoxTransform',
25
+ shift_factor=0.0,
26
+ scale_factor=[0.75, 1.25],
27
+ rotate_factor=60),
28
+ dict(type='TopdownAffine', input_size=(192, 256)),
29
+ dict(type='mmdet.YOLOXHSVRandomAug'),
30
+ dict(
31
+ type='Albumentation',
32
+ transforms=[
33
+ dict(type='Blur', p=0.1),
34
+ dict(type='MedianBlur', p=0.1),
35
+ dict(
36
+ type='CoarseDropout',
37
+ max_holes=1,
38
+ max_height=0.4,
39
+ max_width=0.4,
40
+ min_holes=1,
41
+ min_height=0.2,
42
+ min_width=0.2,
43
+ p=0.5)
44
+ ]),
45
+ dict(
46
+ type='GenerateTarget',
47
+ encoder=dict(
48
+ type='SimCCLabel',
49
+ input_size=(192, 256),
50
+ sigma=(4.9, 5.66),
51
+ simcc_split_ratio=2.0,
52
+ normalize=False,
53
+ use_dark=False)),
54
+ dict(type='PackPoseInputs')
55
+ ])
56
+ ]
57
+ env_cfg = dict(
58
+ cudnn_benchmark=False,
59
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
60
+ dist_cfg=dict(backend='nccl'))
61
+ vis_backends = [dict(type='LocalVisBackend')]
62
+ visualizer = dict(
63
+ type='PoseLocalVisualizer',
64
+ vis_backends=[dict(type='LocalVisBackend')],
65
+ name='visualizer')
66
+ log_processor = dict(
67
+ type='LogProcessor', window_size=50, by_epoch=True, num_digits=6)
68
+ log_level = 'INFO'
69
+ load_from = None
70
+ resume = False
71
+ backend_args = dict(backend='local')
72
+ train_cfg = dict(by_epoch=True, max_epochs=420, val_interval=10)
73
+ val_cfg = dict()
74
+ test_cfg = dict()
75
+ max_epochs = 420
76
+ stage2_num_epochs = 30
77
+ base_lr = 0.004
78
+ randomness = dict(seed=21)
79
+ optim_wrapper = dict(
80
+ type='OptimWrapper',
81
+ optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.0),
82
+ paramwise_cfg=dict(
83
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
84
+ param_scheduler = [
85
+ dict(
86
+ type='LinearLR', start_factor=1e-05, by_epoch=False, begin=0,
87
+ end=1000),
88
+ dict(
89
+ type='CosineAnnealingLR',
90
+ eta_min=0.0002,
91
+ begin=210,
92
+ end=420,
93
+ T_max=210,
94
+ by_epoch=True,
95
+ convert_to_iter_based=True)
96
+ ]
97
+ auto_scale_lr = dict(base_batch_size=1024)
98
+ codec = dict(
99
+ type='SimCCLabel',
100
+ input_size=(192, 256),
101
+ sigma=(4.9, 5.66),
102
+ simcc_split_ratio=2.0,
103
+ normalize=False,
104
+ use_dark=False)
105
+ model = dict(
106
+ type='TopdownPoseEstimator',
107
+ data_preprocessor=dict(
108
+ type='PoseDataPreprocessor',
109
+ mean=[123.675, 116.28, 103.53],
110
+ std=[58.395, 57.12, 57.375],
111
+ bgr_to_rgb=True),
112
+ backbone=dict(
113
+ _scope_='mmdet',
114
+ type='CSPNeXt',
115
+ arch='P5',
116
+ expand_ratio=0.5,
117
+ deepen_factor=0.167,
118
+ widen_factor=0.375,
119
+ out_indices=(4, ),
120
+ channel_attention=True,
121
+ norm_cfg=dict(type='SyncBN'),
122
+ act_cfg=dict(type='SiLU'),
123
+ init_cfg=dict(
124
+ type='Pretrained',
125
+ prefix='backbone.',
126
+ checkpoint=
127
+ 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth'
128
+ )),
129
+ head=dict(
130
+ type='RTMCCHead',
131
+ in_channels=384,
132
+ out_channels=17,
133
+ input_size=(192, 256),
134
+ in_featuremap_size=(6, 8),
135
+ simcc_split_ratio=2.0,
136
+ final_layer_kernel_size=7,
137
+ gau_cfg=dict(
138
+ hidden_dims=256,
139
+ s=128,
140
+ expansion_factor=2,
141
+ dropout_rate=0.0,
142
+ drop_path=0.0,
143
+ act_fn='SiLU',
144
+ use_rel_bias=False,
145
+ pos_enc=False),
146
+ loss=dict(
147
+ type='KLDiscretLoss',
148
+ use_target_weight=True,
149
+ beta=10.0,
150
+ label_softmax=True),
151
+ decoder=dict(
152
+ type='SimCCLabel',
153
+ input_size=(192, 256),
154
+ sigma=(4.9, 5.66),
155
+ simcc_split_ratio=2.0,
156
+ normalize=False,
157
+ use_dark=False)),
158
+ test_cfg=dict(flip_test=True))
159
+ dataset_type = 'CocoDataset'
160
+ data_mode = 'topdown'
161
+ data_root = 'data/'
162
+ train_pipeline = [
163
+ dict(type='LoadImage', backend_args=dict(backend='local')),
164
+ dict(type='GetBBoxCenterScale'),
165
+ dict(type='RandomFlip', direction='horizontal'),
166
+ dict(type='RandomHalfBody'),
167
+ dict(
168
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
169
+ dict(type='TopdownAffine', input_size=(192, 256)),
170
+ dict(type='mmdet.YOLOXHSVRandomAug'),
171
+ dict(
172
+ type='Albumentation',
173
+ transforms=[
174
+ dict(type='Blur', p=0.1),
175
+ dict(type='MedianBlur', p=0.1),
176
+ dict(
177
+ type='CoarseDropout',
178
+ max_holes=1,
179
+ max_height=0.4,
180
+ max_width=0.4,
181
+ min_holes=1,
182
+ min_height=0.2,
183
+ min_width=0.2,
184
+ p=1.0)
185
+ ]),
186
+ dict(
187
+ type='GenerateTarget',
188
+ encoder=dict(
189
+ type='SimCCLabel',
190
+ input_size=(192, 256),
191
+ sigma=(4.9, 5.66),
192
+ simcc_split_ratio=2.0,
193
+ normalize=False,
194
+ use_dark=False)),
195
+ dict(type='PackPoseInputs')
196
+ ]
197
+ val_pipeline = [
198
+ dict(type='LoadImage', backend_args=dict(backend='local')),
199
+ dict(type='GetBBoxCenterScale'),
200
+ dict(type='TopdownAffine', input_size=(192, 256)),
201
+ dict(type='PackPoseInputs')
202
+ ]
203
+ train_pipeline_stage2 = [
204
+ dict(type='LoadImage', backend_args=dict(backend='local')),
205
+ dict(type='GetBBoxCenterScale'),
206
+ dict(type='RandomFlip', direction='horizontal'),
207
+ dict(type='RandomHalfBody'),
208
+ dict(
209
+ type='RandomBBoxTransform',
210
+ shift_factor=0.0,
211
+ scale_factor=[0.75, 1.25],
212
+ rotate_factor=60),
213
+ dict(type='TopdownAffine', input_size=(192, 256)),
214
+ dict(type='mmdet.YOLOXHSVRandomAug'),
215
+ dict(
216
+ type='Albumentation',
217
+ transforms=[
218
+ dict(type='Blur', p=0.1),
219
+ dict(type='MedianBlur', p=0.1),
220
+ dict(
221
+ type='CoarseDropout',
222
+ max_holes=1,
223
+ max_height=0.4,
224
+ max_width=0.4,
225
+ min_holes=1,
226
+ min_height=0.2,
227
+ min_width=0.2,
228
+ p=0.5)
229
+ ]),
230
+ dict(
231
+ type='GenerateTarget',
232
+ encoder=dict(
233
+ type='SimCCLabel',
234
+ input_size=(192, 256),
235
+ sigma=(4.9, 5.66),
236
+ simcc_split_ratio=2.0,
237
+ normalize=False,
238
+ use_dark=False)),
239
+ dict(type='PackPoseInputs')
240
+ ]
241
+ dataset_coco = dict(
242
+ type='RepeatDataset',
243
+ dataset=dict(
244
+ type='CocoDataset',
245
+ data_root='data/',
246
+ data_mode='topdown',
247
+ ann_file='coco/annotations/person_keypoints_train2017.json',
248
+ data_prefix=dict(img='detection/coco/train2017/'),
249
+ pipeline=[]),
250
+ times=3)
251
+ dataset_aic = dict(
252
+ type='AicDataset',
253
+ data_root='data/',
254
+ data_mode='topdown',
255
+ ann_file='aic/annotations/aic_train.json',
256
+ data_prefix=dict(
257
+ img=
258
+ 'pose/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/'
259
+ ),
260
+ pipeline=[
261
+ dict(
262
+ type='KeypointConverter',
263
+ num_keypoints=17,
264
+ mapping=[(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12),
265
+ (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)])
266
+ ])
267
+ train_dataloader = dict(
268
+ batch_size=256,
269
+ num_workers=10,
270
+ persistent_workers=True,
271
+ sampler=dict(type='DefaultSampler', shuffle=True),
272
+ dataset=dict(
273
+ type='CombinedDataset',
274
+ metainfo=dict(from_file='configs/_base_/datasets/coco.py'),
275
+ datasets=[
276
+ dict(
277
+ type='RepeatDataset',
278
+ dataset=dict(
279
+ type='CocoDataset',
280
+ data_root='data/',
281
+ data_mode='topdown',
282
+ ann_file='coco/annotations/person_keypoints_train2017.json',
283
+ data_prefix=dict(img='detection/coco/train2017/'),
284
+ pipeline=[]),
285
+ times=3),
286
+ dict(
287
+ type='AicDataset',
288
+ data_root='data/',
289
+ data_mode='topdown',
290
+ ann_file='aic/annotations/aic_train.json',
291
+ data_prefix=dict(
292
+ img=
293
+ 'pose/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/'
294
+ ),
295
+ pipeline=[
296
+ dict(
297
+ type='KeypointConverter',
298
+ num_keypoints=17,
299
+ mapping=[(0, 6), (1, 8), (2, 10), (3, 5), (4, 7),
300
+ (5, 9), (6, 12), (7, 14), (8, 16), (9, 11),
301
+ (10, 13), (11, 15)])
302
+ ])
303
+ ],
304
+ pipeline=[
305
+ dict(type='LoadImage', backend_args=dict(backend='local')),
306
+ dict(type='GetBBoxCenterScale'),
307
+ dict(type='RandomFlip', direction='horizontal'),
308
+ dict(type='RandomHalfBody'),
309
+ dict(
310
+ type='RandomBBoxTransform',
311
+ scale_factor=[0.6, 1.4],
312
+ rotate_factor=80),
313
+ dict(type='TopdownAffine', input_size=(192, 256)),
314
+ dict(type='mmdet.YOLOXHSVRandomAug'),
315
+ dict(
316
+ type='Albumentation',
317
+ transforms=[
318
+ dict(type='Blur', p=0.1),
319
+ dict(type='MedianBlur', p=0.1),
320
+ dict(
321
+ type='CoarseDropout',
322
+ max_holes=1,
323
+ max_height=0.4,
324
+ max_width=0.4,
325
+ min_holes=1,
326
+ min_height=0.2,
327
+ min_width=0.2,
328
+ p=1.0)
329
+ ]),
330
+ dict(
331
+ type='GenerateTarget',
332
+ encoder=dict(
333
+ type='SimCCLabel',
334
+ input_size=(192, 256),
335
+ sigma=(4.9, 5.66),
336
+ simcc_split_ratio=2.0,
337
+ normalize=False,
338
+ use_dark=False)),
339
+ dict(type='PackPoseInputs')
340
+ ],
341
+ test_mode=False))
342
+ val_dataloader = dict(
343
+ batch_size=64,
344
+ num_workers=10,
345
+ persistent_workers=True,
346
+ drop_last=False,
347
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
348
+ dataset=dict(
349
+ type='CocoDataset',
350
+ data_root='data/',
351
+ data_mode='topdown',
352
+ ann_file='coco/annotations/person_keypoints_val2017.json',
353
+ data_prefix=dict(img='detection/coco/val2017/'),
354
+ test_mode=True,
355
+ pipeline=[
356
+ dict(type='LoadImage', backend_args=dict(backend='local')),
357
+ dict(type='GetBBoxCenterScale'),
358
+ dict(type='TopdownAffine', input_size=(192, 256)),
359
+ dict(type='PackPoseInputs')
360
+ ]))
361
+ test_dataloader = dict(
362
+ batch_size=64,
363
+ num_workers=10,
364
+ persistent_workers=True,
365
+ drop_last=False,
366
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
367
+ dataset=dict(
368
+ type='CocoDataset',
369
+ data_root='data/',
370
+ data_mode='topdown',
371
+ ann_file='coco/annotations/person_keypoints_val2017.json',
372
+ data_prefix=dict(img='detection/coco/val2017/'),
373
+ test_mode=True,
374
+ pipeline=[
375
+ dict(type='LoadImage', backend_args=dict(backend='local')),
376
+ dict(type='GetBBoxCenterScale'),
377
+ dict(type='TopdownAffine', input_size=(192, 256)),
378
+ dict(type='PackPoseInputs')
379
+ ]))
380
+ val_evaluator = dict(
381
+ type='CocoMetric',
382
+ ann_file='data/coco/annotations/person_keypoints_val2017.json')
383
+ test_evaluator = dict(
384
+ type='CocoMetric',
385
+ ann_file='data/coco/annotations/person_keypoints_val2017.json')
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ openmim
+ torch
+ mmengine
+ mmcv
+ mmdet
+ mmpose
+ mmdeploy
+ onnxruntime
+ tqdm
+ scikit-image
+ easydict
+ gradio
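No versions are pinned here; the OpenMMLab packages above are what app.py, main.py and the tools/ modules import. A small, optional check (illustrative only) that they resolve before running the demo:

import mmcv, mmdet, mmdeploy, mmengine, mmpose, onnxruntime
for mod in (mmengine, mmcv, mmdet, mmpose, mmdeploy, onnxruntime):
    print(mod.__name__, getattr(mod, '__version__', 'unknown'))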
tools/apis.py ADDED
@@ -0,0 +1,90 @@
+ import torch
+ from mmengine.registry import MODELS
+ from mmengine.dataset import Compose, pseudo_collate
+ from mmengine.model.utils import revert_sync_batchnorm
+ from mmengine.registry import init_default_scope
+ from mmengine.runner import load_checkpoint
+ from mmengine.config import Config
+
+ from mmdeploy.utils import get_input_shape, load_config
+ from mmdeploy.apis.utils import build_task_processor
+
+ def build_model(cfg, checkpoint=None, device='cpu'):
+     """Build a model from a config and optionally load a checkpoint.
+     The checkpoint meta usually contains dataset class information.
+     """
+     if isinstance(cfg, str):
+         cfg = Config.fromfile(cfg)
+     # scope of the model, e.g. mmdet, mmseg, mmpose...
+     init_default_scope(cfg.default_scope)
+     model = MODELS.build(cfg.model)
+     model = revert_sync_batchnorm(model)
+     if checkpoint is not None:
+         ckpt = load_checkpoint(model, checkpoint,
+                                map_location='cpu')
+         checkpoint_meta = ckpt.get('meta', {})
+         # classes and palette are usually stored in checkpoint_meta
+         model.checkpoint_meta = checkpoint_meta
+     model.to(device)
+     model.eval()
+     return model
+
+ def inference(model, cfg, img):
+     """Given a model, config and image, return the inference results.
+     Models in OpenMMLab do not share the same inference API, so this
+     function is mainly kept as a memo.
+     """
+     if isinstance(cfg, str):
+         cfg = Config.fromfile(cfg)
+     # build the test pipeline
+     test_pipeline = cfg.test_dataloader.dataset.pipeline
+     # Use 'LoadImage' to handle both image arrays and image paths.
+     # This is specially needed for mmdet configs, which use 'LoadImageFromFile'.
+     for pipeline in test_pipeline:
+         if 'LoadImage' in pipeline['type']:
+             pipeline['type'] = 'mmpose.LoadImage'
+
+     init_default_scope(cfg.default_scope)
+     pipeline = Compose(test_pipeline)
+
+     if isinstance(img, str):
+         # img_id is unused, but kept to stay compatible with mmdet
+         data_info = dict(img_path=img, img_id=0)
+     else:
+         data_info = dict(img=img, img_id=0)
+
+     data = pipeline(data_info)
+     batch = pseudo_collate([data])
+
+     with torch.no_grad():
+         results = model.test_step(batch)
+
+     return results
+
+ def build_onnx_model_and_task_processor(model_cfg, deploy_cfg, backend_files, device):
+
+     deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
+
+     task_processor = build_task_processor(model_cfg, deploy_cfg, device)
+
+     model = task_processor.build_backend_model(
+         backend_files, task_processor.update_data_preprocessor)
+
+     return model, task_processor
+
+ def inference_onnx_model(model, task_processor, deploy_cfg, img):
+     input_shape = get_input_shape(deploy_cfg)
+     model_inputs, _ = task_processor.create_input(img, input_shape)
+
+     with torch.no_grad():
+         result = model.test_step(model_inputs)
+
+     return result
+
+ if __name__ == '__main__':
+     config = '/github/Tennis.ai/model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-t_8xb256-420e_aic-coco-256x192.py'
+     ckpt = '/github/Tennis.ai/model_zoo/rtmpose/rtmpose-t_8xb256-420e_aic-coco-256x192/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth'
+     img = '/github/Tennis.ai/assets/000000197388.jpg'
+
+     detector = build_model(config, checkpoint=ckpt)
+     result = inference(detector, config, img)
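For reference, the ONNX helpers above line up with the fields in configs/mark2.py; an illustrative call (not part of the committed code) would look like:

from tools.apis import build_onnx_model_and_task_processor, inference_onnx_model

deploy_cfg = 'model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/detection_onnxruntime_static.py'
model_cfg = 'model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco.py'
backend_files = ['model_zoo/rtmdet/rtmdet_tiny_8xb32-300e_coco/end2end.onnx']

model, task_processor = build_onnx_model_and_task_processor(
    model_cfg, deploy_cfg, backend_files, device='cpu')
result = inference_onnx_model(model, task_processor, deploy_cfg, 'assets/onnx_test.jpg')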
tools/deploy.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ # Modified from mmdeploy/tools/deploy.py; some code was removed to focus only on ONNX export
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import os.path as osp
7
+ from functools import partial
8
+
9
+ import mmengine
10
+ import torch.multiprocessing as mp
11
+ from torch.multiprocessing import Process, set_start_method
12
+
13
+ from mmdeploy.apis import (create_calib_input_data, extract_model,
14
+ get_predefined_partition_cfg, torch2onnx,
15
+ torch2torchscript, visualize_model)
16
+ from mmdeploy.apis.core import PIPELINE_MANAGER
17
+ from mmdeploy.apis.utils import to_backend
18
+ from mmdeploy.backend.sdk.export_info import export2SDK
19
+ from mmdeploy.utils import (IR, Backend, get_backend, get_calib_filename,
20
+ get_ir_config, get_partition_config,
21
+ get_root_logger, load_config, target_wrapper)
22
+
23
+
24
+ def parse_args():
25
+ parser = argparse.ArgumentParser(description='Export model to backends.')
26
+ parser.add_argument('deploy_cfg', help='deploy config path')
27
+ parser.add_argument('model_cfg', help='model config path')
28
+ parser.add_argument('checkpoint', help='model checkpoint path')
29
+ parser.add_argument('img', help='image used to convert the model')
30
+ parser.add_argument(
31
+ '--test-img',
32
+ default=None,
33
+ type=str,
34
+ nargs='+',
35
+ help='image used to test model')
36
+ parser.add_argument(
37
+ '--work-dir',
38
+ default=os.getcwd(),
39
+ help='the dir to save logs and models')
40
+ parser.add_argument(
41
+ '--calib-dataset-cfg',
42
+ help='dataset config path used to calibrate in int8 mode. If not \
43
+ specified, it will use "val" dataset in model config instead.',
44
+ default=None)
45
+ parser.add_argument(
46
+ '--device', help='device used for conversion', default='cpu')
47
+ parser.add_argument(
48
+ '--log-level',
49
+ help='set log level',
50
+ default='INFO',
51
+ choices=list(logging._nameToLevel.keys()))
52
+ parser.add_argument(
53
+ '--show', action='store_true', help='Show detection outputs')
54
+ parser.add_argument(
55
+ '--dump-info', action='store_true', help='Output information for SDK')
56
+ parser.add_argument(
57
+ '--quant-image-dir',
58
+ default=None,
59
+ help='Image directory for quantize model.')
60
+ parser.add_argument(
61
+ '--quant', action='store_true', help='Quantize model to low bit.')
62
+ parser.add_argument(
63
+ '--uri',
64
+ default='192.168.1.1:60000',
65
+ help='Remote ipv4:port or ipv6:port for inference on edge device.')
66
+ args = parser.parse_args()
67
+ return args
68
+
69
+
70
+ def create_process(name, target, args, kwargs, ret_value=None):
71
+ logger = get_root_logger()
72
+ logger.info(f'{name} start.')
73
+ log_level = logger.level
74
+
75
+ wrap_func = partial(target_wrapper, target, log_level, ret_value)
76
+
77
+ process = Process(target=wrap_func, args=args, kwargs=kwargs)
78
+ process.start()
79
+ process.join()
80
+
81
+ if ret_value is not None:
82
+ if ret_value.value != 0:
83
+ logger.error(f'{name} failed.')
84
+ exit(1)
85
+ else:
86
+ logger.info(f'{name} success.')
87
+
88
+
89
+ def torch2ir(ir_type: IR):
90
+ """Return the conversion function from torch to the intermediate
91
+ representation.
92
+
93
+ Args:
94
+ ir_type (IR): The type of the intermediate representation.
95
+ """
96
+ if ir_type == IR.ONNX:
97
+ return torch2onnx
98
+ elif ir_type == IR.TORCHSCRIPT:
99
+ return torch2torchscript
100
+ else:
101
+ raise KeyError(f'Unexpected IR type {ir_type}')
102
+
103
+
104
+ def main():
105
+ args = parse_args()
106
+ set_start_method('spawn', force=True)
107
+ logger = get_root_logger()
108
+ log_level = logging.getLevelName(args.log_level)
109
+ logger.setLevel(log_level)
110
+
111
+ pipeline_funcs = [
112
+ torch2onnx, torch2torchscript, extract_model, create_calib_input_data
113
+ ]
114
+ PIPELINE_MANAGER.enable_multiprocess(True, pipeline_funcs)
115
+ PIPELINE_MANAGER.set_log_level(log_level, pipeline_funcs)
116
+
117
+ deploy_cfg_path = args.deploy_cfg
118
+ model_cfg_path = args.model_cfg
119
+ checkpoint_path = args.checkpoint
120
+ quant = args.quant
121
+ quant_image_dir = args.quant_image_dir
122
+
123
+ # load deploy_cfg
124
+ deploy_cfg, model_cfg = load_config(deploy_cfg_path, model_cfg_path)
125
+
126
+ # create work_dir if not
127
+ mmengine.mkdir_or_exist(osp.abspath(args.work_dir))
128
+
129
+ if args.dump_info:
130
+ export2SDK(
131
+ deploy_cfg,
132
+ model_cfg,
133
+ args.work_dir,
134
+ pth=checkpoint_path,
135
+ device=args.device)
136
+
137
+ ret_value = mp.Value('d', 0, lock=False)
138
+
139
+ # convert to IR
140
+ ir_config = get_ir_config(deploy_cfg)
141
+ ir_save_file = ir_config['save_file']
142
+ ir_type = IR.get(ir_config['type'])
143
+ torch2ir(ir_type)(
144
+ args.img,
145
+ args.work_dir,
146
+ ir_save_file,
147
+ deploy_cfg_path,
148
+ model_cfg_path,
149
+ checkpoint_path,
150
+ device=args.device)
151
+
152
+ # convert backend
153
+ ir_files = [osp.join(args.work_dir, ir_save_file)]
154
+
155
+ # partition model
156
+ partition_cfgs = get_partition_config(deploy_cfg)
157
+
158
+ if partition_cfgs is not None:
159
+
160
+ if 'partition_cfg' in partition_cfgs:
161
+ partition_cfgs = partition_cfgs.get('partition_cfg', None)
162
+ else:
163
+ assert 'type' in partition_cfgs
164
+ partition_cfgs = get_predefined_partition_cfg(
165
+ deploy_cfg, partition_cfgs['type'])
166
+
167
+ origin_ir_file = ir_files[0]
168
+ ir_files = []
169
+ for partition_cfg in partition_cfgs:
170
+ save_file = partition_cfg['save_file']
171
+ save_path = osp.join(args.work_dir, save_file)
172
+ start = partition_cfg['start']
173
+ end = partition_cfg['end']
174
+ dynamic_axes = partition_cfg.get('dynamic_axes', None)
175
+
176
+ extract_model(
177
+ origin_ir_file,
178
+ start,
179
+ end,
180
+ dynamic_axes=dynamic_axes,
181
+ save_file=save_path)
182
+
183
+ ir_files.append(save_path)
184
+
185
+ backend_files = ir_files
186
+ # convert backend
187
+ backend = get_backend(deploy_cfg)
188
+
189
+ # convert to backend
190
+ PIPELINE_MANAGER.set_log_level(log_level, [to_backend])
191
+ if backend == Backend.TENSORRT:
192
+ PIPELINE_MANAGER.enable_multiprocess(True, [to_backend])
193
+ backend_files = to_backend(
194
+ backend,
195
+ ir_files,
196
+ work_dir=args.work_dir,
197
+ deploy_cfg=deploy_cfg,
198
+ log_level=log_level,
199
+ device=args.device,
200
+ uri=args.uri)
201
+
202
+ if args.test_img is None:
203
+ args.test_img = args.img
204
+
205
+ extra = dict(
206
+ backend=backend,
207
+ output_file=osp.join(args.work_dir, f'output_{backend.value}.jpg'),
208
+ show_result=args.show)
209
+ if backend == Backend.SNPE:
210
+ extra['uri'] = args.uri
211
+
212
+ # get backend inference result, try render
213
+ create_process(
214
+ f'visualize {backend.value} model',
215
+ target=visualize_model,
216
+ args=(model_cfg_path, deploy_cfg_path, backend_files, args.test_img,
217
+ args.device),
218
+ kwargs=extra,
219
+ ret_value=ret_value)
220
+
221
+ # get pytorch model inference result, try visualize if possible
222
+ create_process(
223
+ 'visualize pytorch model',
224
+ target=visualize_model,
225
+ args=(model_cfg_path, deploy_cfg_path, [checkpoint_path],
226
+ args.test_img, args.device),
227
+ kwargs=dict(
228
+ backend=Backend.PYTORCH,
229
+ output_file=osp.join(args.work_dir, 'output_pytorch.jpg'),
230
+ show_result=args.show),
231
+ ret_value=ret_value)
232
+ logger.info('All process success.')
233
+
234
+
235
+ if __name__ == '__main__':
236
+ main()
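The tail of `create_process` above runs a pipeline stage in a child process and reports failure back to the parent through a shared `ret_value`. Below is a minimal, self-contained sketch of that pattern using only the standard library; `run_step`, `_wrapped` and `demo_step` are hypothetical names for illustration, not mmdeploy's API.

```python
import logging
from multiprocessing import Process, Value, set_start_method

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def _wrapped(target, ret_value, *args):
    # run the real target; flag failure through the shared value
    try:
        target(*args)
    except Exception:
        ret_value.value = 1
        raise

def run_step(name, target, args=(), ret_value=None):
    if ret_value is None:
        ret_value = Value('d', 0, lock=False)
    process = Process(target=_wrapped, args=(target, ret_value, *args))
    process.start()
    process.join()
    if ret_value.value != 0:
        logger.error(f'{name} failed.')
    else:
        logger.info(f'{name} success.')

def demo_step(msg):
    print(msg)

if __name__ == '__main__':
    # same start method as the script above
    set_start_method('spawn', force=True)
    run_step('demo step', target=demo_step, args=('hello',))
```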
tools/dtw.py ADDED
@@ -0,0 +1,116 @@
1
+ import numpy as np
2
+ from .utils import get_keypoint_weight
3
+
4
+
5
+ class DTWForKeypoints:
6
+ def __init__(self, keypoints1, keypoints2):
7
+ self.keypoints1 = keypoints1
8
+ self.keypoints2 = keypoints2
9
+
10
+ def get_dtw_path(self):
11
+
12
+ norm_kp1 = self.normalize_keypoints(self.keypoints1)
13
+ norm_kp2 = self.normalize_keypoints(self.keypoints2)
14
+
15
+ kp_weight = get_keypoint_weight()
16
+ oks, oks_unnorm = self.object_keypoint_similarity(norm_kp1,
17
+ norm_kp2, keypoint_weights=kp_weight)
18
+ print(f"OKS max {oks.max():.2f} min {oks.min():.2f}")
19
+
20
+ # do the DTW, and return the path
21
+ cost_matrix = 1 - oks
22
+ dtw_dist, dtw_path = self.dynamic_time_warp(cost_matrix)
23
+
24
+ return dtw_path, oks, oks_unnorm
25
+
26
+ def normalize_keypoints(self, keypoints):
27
+ centroid = keypoints.mean(axis=1)[:, None]
28
+ max_distance = np.max(np.sqrt(np.sum((keypoints - centroid) ** 2, axis=2)),
29
+ axis=1) + 1e-6
30
+
31
+ normalized_keypoints = (keypoints - centroid) / max_distance[:, None, None]
32
+ return normalized_keypoints
33
+
34
+ def keypoints_areas(self, keypoints):
35
+ min_coords = np.min(keypoints, axis=1)
36
+ max_coords = np.max(keypoints, axis=1)
37
+ areas = np.prod(max_coords - min_coords, axis=1)
38
+ return areas
39
+
40
+ def object_keypoint_similarity(self, keypoints1,
41
+ keypoints2,
42
+ scale_constant=0.2,
43
+ keypoint_weights=None):
44
+ """ Calculate the Object Keypoint Similarity (OKS) for multiple objects,
45
+ and add weight to each keypoint. Here we choose to normalize the points
46
+ using centroid and max distance instead of bounding box area.
47
+ """
48
+ # Compute squared distances between all pairs of keypoints
49
+ sq_diff = np.sum((keypoints1[:, None] - keypoints2) ** 2, axis=-1)
50
+
51
+ oks = np.exp(-sq_diff / (2 * scale_constant ** 2))
52
+ oks_unnorm = oks.copy()
53
+
54
+ if keypoint_weights is not None:
55
+ oks = oks * keypoint_weights
56
+ oks = np.sum(oks, axis=-1)
57
+ else:
58
+ oks = np.mean(oks, axis=-1)
59
+
60
+ return oks, oks_unnorm
61
+
62
+ def dynamic_time_warp(self, cost_matrix, R=1000):
63
+ """Compute the Dynamic Time Warping distance and path between two time series.
64
+ If the series are long, the Sakoe-Chiba band of radius R constrains the warping path,
65
+ so the time complexity is bounded by O(M*R).
66
+ """
67
+
68
+ M = len(self.keypoints1)
69
+ N = len(self.keypoints2)
70
+
71
+ # Initialize the distance matrix with infinity
72
+ D = np.full((M, N), np.inf)
73
+
74
+ # Initialize the first row and column of the matrix
75
+ D[0, 0] = cost_matrix[0, 0]
76
+ for i in range(1, M):
77
+ D[i, 0] = D[i - 1, 0] + cost_matrix[i, 0]
78
+
79
+ for j in range(1, N):
80
+ D[0, j] = D[0, j - 1] + cost_matrix[0, j]
81
+
82
+ # Fill the remaining elements of the matrix within the
83
+ # Sakoe-Chiba Band using dynamic programming
84
+ for i in range(1, M):
85
+ for j in range(max(1, i - R), min(N, i + R + 1)):
86
+ cost = cost_matrix[i, j]
87
+ D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
88
+
89
+ # Backtrack to find the optimal path
90
+ path = [(M - 1, N - 1)]
91
+ i, j = M - 1, N - 1
92
+ while i > 0 or j > 0:
93
+ min_idx = np.argmin([D[i - 1, j], D[i, j - 1], D[i - 1, j - 1]])
94
+ if min_idx == 0:
95
+ i -= 1
96
+ elif min_idx == 1:
97
+ j -= 1
98
+ else:
99
+ i -= 1
100
+ j -= 1
101
+ path.append((i, j))
102
+ path.reverse()
103
+
104
+ return D[-1, -1], path
105
+
106
+ if __name__ == '__main__':
107
+
108
+ from mmengine.fileio import load
109
+
110
+ keypoints1, kp1_scores = load('tennis1.pkl')
111
+ keypoints2, kp2_scores = load('tennis3.pkl')
112
+
113
+ # Align the two keypoint sequences with DTW
114
+ dtw = DTWForKeypoints(keypoints1, keypoints2)
115
+ path, oks, oks_unnorm = dtw.get_dtw_path()
116
+ print(path)
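As the docstrings above describe, the keypoints are first normalized by their centroid and maximum spread, a weighted OKS matrix is computed between every pair of frames, and DTW is run on `1 - OKS` to align the two sequences. A toy sketch of that flow with synthetic data (the arrays are random placeholders, not the project's tennis clips; run from the repository root):

```python
import numpy as np
from tools.dtw import DTWForKeypoints

rng = np.random.default_rng(0)
seq_a = rng.uniform(0, 640, size=(12, 17, 2))            # 12 frames of 17 COCO keypoints
seq_b = seq_a[::2] + rng.normal(0, 5, size=(6, 17, 2))   # a shorter, noisier copy

path, oks, oks_unnorm = DTWForKeypoints(seq_a, seq_b).get_dtw_path()

# `path` pairs frame indices of seq_a with frame indices of seq_b;
# oks[i, j] is the weighted similarity that app.py feeds to the score bar.
for i, j in path:
    print(i, j, round(float(oks[i, j]), 2))
```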
tools/inferencer.py ADDED
@@ -0,0 +1,154 @@
1
+ import numpy as np
2
+ import mmcv
3
+ from pathlib import Path
4
+ from collections import namedtuple
5
+ import cv2 as cv
6
+ from tqdm import tqdm
7
+ from mmengine.registry import init_default_scope
8
+ from mmengine.visualization import Visualizer
9
+ from mmpose.apis import inference_topdown, init_model
10
+ from mmdet.apis import inference_detector, init_detector
11
+ from .utils import filter_by_catgory, filter_by_score, Timer
12
+ from .apis import build_onnx_model_and_task_processor, inference_onnx_model
13
+
14
+
15
+ class PoseInferencer:
16
+ def __init__(self,
17
+ det_cfg,
18
+ pose_cfg,
19
+ device='cpu') -> None:
20
+ # init
21
+ self.det_model_cfg = det_cfg.model_cfg
22
+ self.det_model_ckpt = det_cfg.model_ckpt
23
+ self.pose_model_cfg = pose_cfg.model_cfg
24
+ self.pose_model_ckpt = pose_cfg.model_ckpt
25
+
26
+ self.detector = init_detector(self.det_model_cfg,
27
+ self.det_model_ckpt,
28
+ device=device)
29
+ self.pose_model = init_model(self.pose_model_cfg,
30
+ self.pose_model_ckpt,
31
+ device=device)
32
+
33
+ def process_one_image(self, img):
34
+ init_default_scope('mmdet')
35
+ det_result = inference_detector(self.detector, img)
36
+ det_inst = det_result.pred_instances.cpu().numpy()
37
+ bboxes, scores, labels = (det_inst.bboxes,
38
+ det_inst.scores,
39
+ det_inst.labels)
40
+ bboxes, scores, labels = filter_by_score(bboxes, scores,
41
+ labels, 0.5)
42
+ bboxes, scores, labels = filter_by_catgory(bboxes, scores, labels,
43
+ ['person'])
44
+ # inference with pose model
45
+ init_default_scope('mmpose')
46
+ pose_result = inference_topdown(self.pose_model, img, bboxes)
47
+ if len(pose_result) == 0:
48
+ # no detection place holder
49
+ keypoints = np.zeros((1, 17, 2))
50
+ pts_scores = np.zeros((1, 17))
51
+ bboxes = np.zeros((1, 4))
52
+ scores = np.zeros((1, ))
53
+ labels = np.zeros((1, ))
54
+ else:
55
+ keypoints = np.concatenate([r.pred_instances.keypoints
56
+ for r in pose_result])
57
+ pts_scores = np.concatenate([r.pred_instances.keypoint_scores
58
+ for r in pose_result])
59
+
60
+ DetInst = namedtuple('DetInst', ['bboxes', 'scores', 'labels'])
61
+ PoseInst = namedtuple('PoseInst', ['keypoints', 'pts_scores'])
62
+ return DetInst(bboxes, scores, labels), PoseInst(keypoints, pts_scores)
63
+
64
+ def inference_video(self, video_path):
65
+ """ Inference a video with detector and pose model
66
+ Return:
67
+ all_pose: a list of PoseInst, check the namedtuple definition
68
+ all_det: a list of DetInst
69
+ """
70
+ video_reader = mmcv.VideoReader(video_path)
71
+ all_pose, all_det = [], []
72
+
73
+ for frame in tqdm(video_reader):
74
+ # inference with detector
75
+ det, pose = self.process_one_image(frame)
76
+ all_pose.append(pose)
77
+ all_det.append(det)
78
+
79
+ return all_det, all_pose
80
+
81
+ class PoseInferencerV2:
82
+ """ V2 Use onnx for detection model, still use pytorch for pose model.
83
+ """
84
+ def __init__(self,
85
+ det_cfg,
86
+ pose_cfg,
87
+ device='cpu') -> None:
88
+ # init
89
+ self.det_deploy_cfg = det_cfg.deploy_cfg
90
+ self.det_model_cfg = det_cfg.model_cfg
91
+ self.det_backend_files = det_cfg.backend_files
92
+
93
+ self.pose_model_cfg = pose_cfg.model_cfg
94
+ self.pose_model_ckpt = pose_cfg.model_ckpt
95
+
96
+ self.detector, self.task_processor = \
97
+ build_onnx_model_and_task_processor(self.det_model_cfg,
98
+ self.det_deploy_cfg,
99
+ self.det_backend_files,
100
+ device)
101
+ self.pose_model = init_model(self.pose_model_cfg,
102
+ self.pose_model_ckpt,
103
+ device)
104
+
105
+ def process_one_image(self, img):
106
+ init_default_scope('mmdet')
107
+ det_result = inference_onnx_model(self.detector,
108
+ self.task_processor,
109
+ self.det_deploy_cfg,
110
+ img)
111
+ det_inst = det_result[0].pred_instances.cpu().numpy()
112
+ bboxes, scores, labels = (det_inst.bboxes,
113
+ det_inst.scores,
114
+ det_inst.labels)
115
+ bboxes, scores, labels = filter_by_score(bboxes, scores,
116
+ labels, 0.5)
117
+ bboxes, scores, labels = filter_by_catgory(bboxes, scores, labels,
118
+ ['person'])
119
+ # inference with pose model
120
+ init_default_scope('mmpose')
121
+ pose_result = inference_topdown(self.pose_model, img, bboxes)
122
+ if len(pose_result) == 0:
123
+ # no detection place holder
124
+ keypoints = np.zeros((1, 17, 2))
125
+ pts_scores = np.zeros((1, 17))
126
+ bboxes = np.zeros((1, 4))
127
+ scores = np.zeros((1, ))
128
+ labels = np.zeros((1, ))
129
+ else:
130
+ keypoints = np.concatenate([r.pred_instances.keypoints
131
+ for r in pose_result])
132
+ pts_scores = np.concatenate([r.pred_instances.keypoint_scores
133
+ for r in pose_result])
134
+
135
+ DetInst = namedtuple('DetInst', ['bboxes', 'scores', 'labels'])
136
+ PoseInst = namedtuple('PoseInst', ['keypoints', 'pts_scores'])
137
+ return DetInst(bboxes, scores, labels), PoseInst(keypoints, pts_scores)
138
+
139
+ def inference_video(self, video_path):
140
+ """ Inference a video with detector and pose model
141
+ Return:
142
+ all_pose: a list of PoseInst, check the namedtuple definition
143
+ all_det: a list of DetInst
144
+ """
145
+ video_reader = mmcv.VideoReader(video_path)
146
+ all_pose, all_det = [], []
147
+
148
+ for frame in tqdm(video_reader):
149
+ # inference with detector
150
+ det, pose = self.process_one_image(frame)
151
+ all_pose.append(pose)
152
+ all_det.append(det)
153
+
154
+ return all_det, all_pose
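Both inferencers follow the same two-stage flow: detect people, keep boxes scored above 0.5 and labelled 'person', then run top-down pose estimation on each box. A hedged usage sketch of the PyTorch variant follows; the config fields mirror what `__init__` reads, but the file paths are placeholders rather than files shipped in this commit.

```python
from easydict import EasyDict
from tools.inferencer import PoseInferencer   # run from the repository root

# hypothetical paths: substitute your own detector / pose configs and checkpoints
det_cfg = EasyDict(model_cfg='model_zoo/det/model.py',
                   model_ckpt='model_zoo/det/model.pth')
pose_cfg = EasyDict(model_cfg='model_zoo/pose/model.py',
                    model_ckpt='model_zoo/pose/model.pth')

inferencer = PoseInferencer(det_cfg, pose_cfg, device='cpu')
all_det, all_pose = inferencer.inference_video('some_video.mp4')

# one DetInst / PoseInst per frame
print(len(all_det), all_pose[0].keypoints.shape)
```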
tools/manager.py ADDED
@@ -0,0 +1,72 @@
1
+ import mim
2
+ from pathlib import Path
3
+ from mim.utils import get_installed_path, echo_success
4
+ from mmengine.config import Config
5
+
6
+ class Manager:
7
+
8
+ def __init__(self, path=None) -> None:
9
+ """
10
+ Params:
11
+ - path: root path of the project, used to save checkpoints and configs
12
+ """
13
+ if path:
14
+ self.path = Path(path)
15
+ else:
16
+ self.path = Path(__file__).parents[1]
17
+ self.keys = ['weight', 'config', 'model', 'training_data']
18
+
19
+ def get_model_infos(self, package_name, keyword: str=None):
20
+ """ because mim search is too strict,
21
+ I want to search by keyword, not a strict match
22
+ """
23
+ model_infos = mim.get_model_info(package_name)
24
+ model_names = model_infos.index
25
+ info_keys = model_infos.columns.tolist()
26
+ keys = self.intersect_keys(info_keys,
27
+ self.keys)
28
+ if keyword is None:
29
+ return model_infos.loc[:, keys]
30
+ # get valid names, which contains the keyword
31
+ valid_names = [name for name in model_names
32
+ if keyword in name]
33
+ filter_infos = model_infos.loc[valid_names, keys]
34
+ return filter_infos
35
+
36
+ def intersect_keys(self, keys1 , keys2):
37
+ return list(set(keys1) & set(keys2))
38
+
39
+ def download(self, package, model, config_only=False):
40
+ """ Use model names to download checkpoints and configs.
41
+ Args:
42
+ - package: package name, e.g. mmdet
43
+ - model: model name, e.g. faster_rcnn or faster_rcnn_r50_fpn_1x_coco
44
+ - config_only: only download configs, which is helpful when you
45
+ have already downloaded the checkpoints faster through other channels.
46
+ """
47
+ infos = self.get_model_infos(package, model)
48
+
49
+ for model, info in infos.iterrows():
50
+ # get destination path
51
+ hyper_name = info['model']
52
+ dst_path = self.path / 'model_zoo' / hyper_name / model
53
+ dst_path.mkdir(parents=True, exist_ok=True)
54
+
55
+ if config_only:
56
+ # get config path of the package
57
+ installed_path = Path(get_installed_path(package))
58
+ config_path = info['config']
59
+ config_path = installed_path / '.mim' / config_path
60
+ # build and dump config
61
+ config_obj = Config.fromfile(config_path)
62
+ saved_config_path = dst_path / f'{model}.py'
63
+ config_obj.dump(saved_config_path)
64
+ echo_success(
65
+ f'Successfully dumped {model}.py to {dst_path}')
66
+ else:
67
+ mim.download(package, [model], dest_root=dst_path)
68
+
69
+ if __name__ == '__main__':
70
+ m = Manager()
71
+ print(m.get_model_infos('mmdet', 'det'))
72
+ # m.download('mmpose', 'rtmpose-t_8xb256-420e_aic-coco-256x192', config_only=True)
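Manager is a thin wrapper over mim: `get_model_infos` filters mim's model metadata by a keyword in the model name, and `download` places checkpoints and configs under `<root>/model_zoo/<hyper_name>/<model>`. A hedged usage sketch; the model name in the comment is an example identifier, not a verified entry in mim's metadata.

```python
from tools.manager import Manager   # run from the repository root

m = Manager()                                      # defaults to the repository root
infos = m.get_model_infos('mmdet', keyword='rtmdet')
print(infos)                                       # rows whose model name contains 'rtmdet'

# download config + checkpoint into <root>/model_zoo/... (example name, uncomment to run)
# m.download('mmdet', 'rtmdet_tiny_8xb32-300e_coco')
```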
tools/utils.py ADDED
@@ -0,0 +1,120 @@
1
+ from mmdet.datasets import CocoDataset
2
+ import time
3
+ from pathlib import Path
4
+ from ffmpy import FFmpeg
5
+ import shutil
6
+ import tempfile
7
+ from easydict import EasyDict
8
+ import numpy as np
9
+
10
+ def coco_keypoint_id_table(reverse=False):
11
+ id2name = { 0: 'nose',
12
+ 1: 'left_eye',
13
+ 2: 'right_eye',
14
+ 3: 'left_ear',
15
+ 4: 'right_ear',
16
+ 5: 'left_shoulder',
17
+ 6: 'right_shoulder',
18
+ 7: 'left_elbow',
19
+ 8: 'right_elbow',
20
+ 9: 'left_wrist',
21
+ 10: 'right_wrist',
22
+ 11: 'left_hip',
23
+ 12: 'right_hip',
24
+ 13: 'left_knee',
25
+ 14: 'right_knee',
26
+ 15: 'left_ankle',
27
+ 16: 'right_ankle'}
28
+ if reverse:
29
+ return {v: k for k, v in id2name.items()}
30
+ return id2name
31
+
32
+ def get_skeleton():
33
+ """ My skeleton links, I deleted some links from default coco style.
34
+ """
35
+ SKELETON = EasyDict()
36
+ SKELETON.head = [[0,1], [0,2], [1,3], [2,4]]
37
+ SKELETON.left_arm = [[5, 7], [7, 9]]
38
+ SKELETON.right_arm = [[6, 8], [8, 10]]
39
+ SKELETON.left_leg = [[11, 13], [13, 15]]
40
+ SKELETON.right_leg = [[12, 14], [14, 16]]
41
+ SKELETON.body = [[5, 6], [5, 11], [6, 12], [11, 12]]
42
+ return SKELETON
43
+
44
+ def get_keypoint_weight(low_weight_ratio=0.1, mid_weight_ratio=0.5):
45
+ """ Get keypoint weight, used in object keypoint similarity,
46
+ `low_weight_names` are points I want to pay less attention to.
47
+ """
48
+ low_weight_names = ['nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear']
49
+ mid_weight_names = ['left_shoulder', 'right_shoulder', 'left_hip', 'right_hip']
50
+
51
+ logtis = np.ones(17)
52
+ name2id = coco_keypoint_id_table(reverse=True)
53
+
54
+ low_weight_id = [name2id[n] for n in low_weight_names]
55
+ mid_weight_id = [name2id[n] for n in mid_weight_names]
56
+ logtis[low_weight_id] = low_weight_ratio
57
+ logtis[mid_weight_id] = mid_weight_ratio
58
+
59
+ weights = logtis / np.sum(logtis)
60
+ return weights
61
+
62
+ def coco_cat_id_table():
63
+ classes = CocoDataset.METAINFO['classes']
64
+ id2name = {i: name for i, name in enumerate(classes)}
65
+
66
+ return id2name
67
+
68
+ def filter_by_catgory(bboxes, scores, labels, names):
69
+ """ Filter labels by classes
70
+ Args:
71
+ - labels: list of labels, each label is a dict
72
+ - classes: list of class names
73
+ """
74
+ id2name = coco_cat_id_table()
75
+ # names of labels
76
+ label_names = [id2name[id] for id in labels]
77
+ # filter by class names
78
+ mask = np.isin(label_names, names)
79
+ return bboxes[mask], scores[mask], labels[mask]
80
+
81
+ def filter_by_score(bboxes, scores, labels, score_thr):
82
+ """ Filter bboxes by score threshold
83
+ Args:
84
+ - bboxes, scores, labels: per-detection arrays of equal length
85
+ - score_thr: score threshold
86
+ """
87
+ mask = scores > score_thr
88
+ return bboxes[mask], scores[mask], labels[mask]
89
+
90
+ def convert_video_to_playable_mp4(video_path: str) -> str:
91
+ """ Copied from gradio
92
+ Convert the video to mp4. If something goes wrong return the original video.
93
+ """
94
+ try:
95
+ output_path = Path(video_path).with_suffix(".mp4")
96
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
97
+ shutil.copy2(video_path, tmp_file.name)
98
+ # ffmpeg will automatically use h264 codec (playable in browser) when converting to mp4
99
+ ff = FFmpeg(
100
+ inputs={str(tmp_file.name): None},
101
+ outputs={str(output_path): None},
102
+ global_options="-y -loglevel quiet",
103
+ )
104
+ ff.run()
105
+ except Exception as e:
106
+ print(f"Error converting video to browser-playable format {str(e)}")
107
+ output_path = video_path
108
+ return str(output_path)
109
+
110
+ class Timer:
111
+ def __init__(self):
112
+ self.start_time = time.time()
113
+
114
+ def click(self):
115
+ used_time = time.time() - self.start_time
116
+ self.start_time = time.time()
117
+ return used_time
118
+
119
+ def start(self):
120
+ self.start_time = time.time()
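`get_keypoint_weight` builds the per-keypoint weights used by the OKS computation: face points get `low_weight_ratio`, shoulders and hips get `mid_weight_ratio`, the remaining joints keep weight 1, and everything is normalized to sum to 1. A quick sanity-check sketch (run from the repository root; importing tools.utils pulls in mmdet):

```python
import numpy as np
from tools.utils import get_keypoint_weight, coco_keypoint_id_table

w = get_keypoint_weight(low_weight_ratio=0.1, mid_weight_ratio=0.5)
id2name = coco_keypoint_id_table()

print(np.isclose(w.sum(), 1.0))          # True: weights are normalized
for i, name in id2name.items():
    print(f'{name:15s} {w[i]:.3f}')      # face < shoulders/hips < limbs
```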
tools/visualizer.py ADDED
@@ -0,0 +1,346 @@
1
+ import cv2
2
+ import numpy as np
3
+ from skimage import draw, io
4
+ from PIL import Image, ImageDraw, ImageFont
5
+ from easydict import EasyDict
6
+ from typing import Union
7
+ from .utils import get_skeleton, Timer
8
+
9
+ class FastVisualizer:
10
+ """ Use skimage to draw, which is much faster than matplotlib, and
11
+ more beautiful than opencv.😎
12
+ """
13
+ # TODO: modify color input parameter
14
+ def __init__(self, image=None) -> None:
15
+ self.set_image(image)
16
+ self.colors = self.get_pallete()
17
+ self.skeleton = get_skeleton()
18
+ self.lvl_tresh = self.set_level([0.3, 0.6, 0.8])
19
+
20
+ def set_image(self, image: Union[str, np.ndarray]):
21
+ if isinstance(image, str):
22
+ self.image = cv2.imread(image)
23
+ elif isinstance(image, np.ndarray) or image is None:
24
+ self.image = image
25
+ else:
26
+ raise TypeError(f"Type {type(image)} is not supported")
27
+
28
+ def get_image(self):
29
+ return self.image
30
+
31
+ def draw_box(self, box_coord, color=(25, 113, 194), alpha=1.0):
32
+ """ Draw a box on the image
33
+ Args:
34
+ box_coord: a list of [xmin, ymin, xmax, ymax]
35
+ alpha: the alpha of the box
36
+ color: the edge color of the box
37
+ """
38
+ xmin, ymin, xmax, ymax = box_coord
39
+ rr, cc = draw.rectangle_perimeter((ymin, xmin), (ymax, xmax))
40
+ draw.set_color(self.image, (rr, cc), color, alpha=alpha)
41
+ return self
42
+
43
+ def draw_rectangle(self, box_coord, color=(25, 113, 194), alpha=1.0):
44
+ xmin, ymin, xmax, ymax = box_coord
45
+ rr, cc = draw.rectangle((ymin, xmin), (ymax, xmax))
46
+ draw.set_color(self.image, (rr, cc), color, alpha=alpha)
47
+ return self
48
+
49
+ def draw_point(self, point_coord, radius=5, color=(25, 113, 194), alpha=1.0):
50
+ """ Coord in (x, y) format, but will be converted to (y, x)
51
+ """
52
+ x, y = point_coord
53
+ rr, cc = draw.disk((y, x), radius=radius)
54
+ draw.set_color(self.image, (rr, cc), color, alpha=alpha)
55
+ return self
56
+
57
+ def draw_line(self, start_point, end_point, color=(25, 113, 194), alpha=1.0):
58
+ """ Not used, because I can't produce smooth line.
59
+ """
60
+ cv2.line(self.image, start_point, end_point, color.tolist(), 2,
61
+ cv2.LINE_AA)
62
+ return self
63
+
64
+ def draw_line_aa(self, start_point, end_point, color=(25, 113, 194), alpha=1.0):
65
+ """ Not used, because I can't produce smooth line.
66
+ """
67
+ x1, y1 = start_point
68
+ x2, y2 = end_point
69
+ rr, cc, val = draw.line_aa(y1, x1, y2, x2)
70
+ draw.set_color(self.image, (rr, cc), color, alpha=alpha)
71
+ return self
72
+
73
+ def draw_thick_line(self, start_point, end_point, thickness=1, color=(25, 113, 194), alpha=1.0):
74
+ """ Not used, because I can't produce smooth line.
75
+ """
76
+ x1, y1 = start_point
77
+ x2, y2 = end_point
78
+ dx, dy = x2 - x1, y2 - y1
79
+ length = np.sqrt(dx * dx + dy * dy)
80
+ cos, sin = dx / length, dy / length
81
+
82
+ half_t = thickness / 2.0
83
+ # Calculate the polygon vertices
84
+ vertices_x = [x1 - half_t * sin, x1 + half_t * sin,
85
+ x2 + half_t * sin, x2 - half_t * sin]
86
+ vertices_y = [y1 + half_t * cos, y1 - half_t * cos,
87
+ y2 - half_t * cos, y2 + half_t * cos]
88
+ rr, cc = draw.polygon(vertices_y, vertices_x)
89
+ draw.set_color(self.image, (rr, cc), color, alpha)
90
+
91
+ return self
92
+
93
+ def draw_text(self, text, position,
94
+ font_path='assets/SmileySans/SmileySans-Oblique.ttf',
95
+ font_size=20,
96
+ text_color=(255, 255, 255)):
97
+ """ Position is the left top corner of the text
98
+ """
99
+ # Convert the NumPy array to a PIL image
100
+ pil_image = Image.fromarray(np.uint8(self.image))
101
+ # Load the font (default is Arial)
102
+ font = ImageFont.truetype(font_path, font_size)
103
+ # Create a drawing object
104
+ draw = ImageDraw.Draw(pil_image)
105
+ # Add the text to the image
106
+ draw.text(position, text, font=font, fill=text_color)
107
+ # Convert the PIL image back to a NumPy array
108
+ result = np.array(pil_image)
109
+
110
+ self.image = result
111
+ return self
112
+
113
+ def xyhw_to_xyxy(self, box):
114
+ hw = box[2:]
115
+ x1y1 = box[:2] - hw / 2
116
+ x2y2 = box[:2] + hw / 2
117
+ return np.concatenate([x1y1, x2y2]).astype(np.int32)
118
+
119
+ def draw_line_in_discrete_style(self, start_point, end_point, size=2, sample_points=3,
120
+ color=(25, 113, 194), alpha=1.0):
121
+ """ When drawing continous line, it is super fuzzy, and I can't handle them
122
+ very well even tried OpneCV & PIL all kinds of ways. This is a workaround.
123
+ The discrete line will be represented with few sampled cubes along the line,
124
+ and it is exclusive with start & end points.
125
+ """
126
+ # sample points
127
+ points = np.linspace(start_point, end_point, sample_points + 2)[1:-1]
128
+ for p in points:
129
+ rectangle_xyhw = np.array((p[0], p[1], size, size))
130
+ rectangle_xyxy = self.xyhw_to_xyxy(rectangle_xyhw)
131
+ self.draw_rectangle(rectangle_xyxy, color, alpha)
132
+ return self
133
+
134
+ def draw_human_keypoints(self, keypoints, scores=None, factor=20, draw_skeleton=False):
135
+ """ Draw skeleton on the image, and give different color according
136
+ to similarity scores.
137
+ """
138
+ # get max length of skeleton
139
+ max_x, max_y = np.max(keypoints, axis=0)
140
+ min_x, min_y = np.min(keypoints, axis=0)
141
+ max_length = max(max_x - min_x, max_y - min_y)
142
+ if max_length < 1: return self
143
+ cube_size = max_length // factor
144
+ line_cube_size = cube_size // 2
145
+ # draw skeleton in discrete style
146
+ if draw_skeleton:
147
+ for key, links in self.skeleton.items():
148
+ links = np.array(links)
149
+ start_points = keypoints[links[:, 0]]
150
+ end_points = keypoints[links[:, 1]]
151
+ for s, e in zip(start_points, end_points):
152
+ self.draw_line_in_discrete_style(s, e, line_cube_size,
153
+ color=self.colors[key], alpha=0.9)
154
+ # draw points
155
+ if scores is None: # use vamos color
156
+ lvl_names = ['vamos'] * len(keypoints)
157
+ else: lvl_names = self.score_level_names(scores)
158
+
159
+ for idx, (point, lvl_name) in enumerate(zip(keypoints, lvl_names)):
160
+ if idx in set((1, 2, 3, 4)):
161
+ continue # do not draw eyes and ears
162
+ rectangle_xyhw = np.array((point[0], point[1], cube_size, cube_size))
163
+ rectangle_xyxy = self.xyhw_to_xyxy(rectangle_xyhw)
164
+ self.draw_rectangle(rectangle_xyxy,
165
+ color=self.colors[lvl_name],
166
+ alpha=0.8)
167
+ return self
168
+
169
+ def draw_score_bar(self, score, factor=50, bar_ratio=7):
170
+ """ Draw a score bar on the left top of the image.
171
+ factor: the value of image longer edge divided by the bar height
172
+ bar_ratio: the ratio of bar width to bar height
173
+ """
174
+ # calculate bar's height and width
175
+ long_edge = np.max(self.image.shape[:2])
176
+ short_edge = np.min(self.image.shape[:2])
177
+ bar_h = long_edge // factor
178
+ bar_w = bar_h * bar_ratio
179
+ if bar_w * 3 > short_edge:
180
+ # when the image width is not enough
181
+ bar_w = short_edge // 4
182
+ bar_h = bar_w // bar_ratio
183
+ cube_size = bar_h
184
+ # bar's base position
185
+ bar_start_point = (2*bar_h, 2*bar_h)
186
+ # draw bar horizontally, and record the position of each word
187
+ word_positions = []
188
+ box_coords = []
189
+ colors = [self.colors.bad, self.colors.good, self.colors.vamos]
190
+ for i, color in enumerate(colors):
191
+ x0, y0 = bar_start_point[0] + i*bar_w, bar_start_point[1]
192
+ x1, y1 = x0 + bar_w - 1, y0 + bar_h
193
+ box_coord = np.array((x0, y0, x1, y1), dtype=np.int32)
194
+ self.draw_rectangle(box_coord, color=color)
195
+
196
+ box_coords.append(box_coord)
197
+ word_positions.append(np.array((x0, y1 + bar_h // 2)))
198
+ # calculate cube position according to score
199
+ lvl, lvl_ratio, lvl_name = self.score_level(score)
200
+ # the first level start point is the first bar
201
+ cube_lvl_start_x0 = [box_coord[0] - cube_size // 2 if i != 0
202
+ else box_coord[0]
203
+ for i, box_coord in enumerate(box_coords)]
204
+ # process the last level, I want the cube stays in the bar
205
+ level_length = bar_w if lvl == 1 else bar_w - cube_size // 2
206
+ cube_x0 = cube_lvl_start_x0[lvl] + lvl_ratio * level_length
207
+ cube_y0 = bar_start_point[1] - bar_h // 2 - cube_size
208
+ cube_x1 = cube_x0 + cube_size
209
+ cube_y1 = cube_y0 + cube_size
210
+ # draw cube
211
+ self.draw_rectangle((cube_x0, cube_y0, cube_x1, cube_y1),
212
+ color=self.colors.cube)
213
+ # enlarge the box, to emphasize the level
214
+ enlarged_box = box_coords[lvl].copy()
215
+ enlarged_box[:2] = enlarged_box[:2] - bar_h // 8
216
+ enlarged_box[2:] = enlarged_box[2:] + bar_h // 8
217
+ self.draw_rectangle(enlarged_box, color=self.colors[lvl_name])
218
+
219
+ # draw text
220
+ if lvl_name == 'vamos':
221
+ lvl_name = 'vamos!!' # exciting!
222
+ self.draw_text(lvl_name.capitalize(),
223
+ word_positions[lvl],
224
+ font_size=bar_h * 2,
225
+ text_color=tuple(colors[lvl].tolist()))
226
+
227
+ return self
228
+
229
+ def draw_non_transparent_area(self, box_coord, alpha=0.2, extend_ratio=0.1):
230
+ """ Make image outside the box transparent using alpha blend
231
+ """
232
+ x1, y1, x2, y2 = box_coord.astype(np.int32)
233
+ # enlarge the box by extend_ratio (10% by default)
234
+ max_len = max((x2 - x1), (y2 - y1))
235
+ extend_len = int(max_len * extend_ratio)
236
+ x1, y1 = x1 - extend_len, y1 - extend_len
237
+ x2, y2 = x2 + extend_len, y2 + extend_len
238
+ # clip the box
239
+ h, w = self.image.shape[:2]
240
+ x1, y1, x2, y2 = np.clip((x1,y1,x2,y2), a_min=0,
241
+ a_max=(w,h,w,h))
242
+ # Create a white background color
243
+ bg_color = np.ones_like(self.image) * 255
244
+ # Copy the box region from the image
245
+ bg_color[y1:y2, x1:x2] = self.image[y1:y2, x1:x2]
246
+ # Alpha blend inplace
247
+ self.image[:] = self.image * alpha + bg_color * (1 - alpha)
248
+ return self
249
+
250
+ def draw_logo(self, logo='assets/logo.png', factor=30, shift=20):
251
+ """ Draw logo on the right bottom of the image.
252
+ """
253
+ H, W = self.image.shape[:2]
254
+ # load logo
255
+ logo_img = Image.open(logo)
256
+ # scale logo
257
+ logo_h = self.image.shape[0] // factor
258
+ scale_size = logo_h / logo_img.size[1]
259
+ logo_w = int(logo_img.size[0] * scale_size)
260
+ logo_img = logo_img.resize((logo_w, logo_h))
261
+ # convert to RGBA
262
+ image = Image.fromarray(self.image).convert("RGBA")
263
+ # alpha blend
264
+ image.alpha_composite(logo_img, (W - logo_w - shift,
265
+ H - logo_h - shift))
266
+ self.image = np.array(image.convert("RGB"))
267
+ return self
268
+
269
+ def score_level(self, score):
270
+ """ Return the level according to level thresh.
271
+ """
272
+ t = self.lvl_tresh
273
+ if score < t[1]: # t[0] might be bigger than 0
274
+ ratio = (score - t[0]) / (t[1] - t[0])
275
+ ratio = np.clip(ratio, a_min=0, a_max=1)
276
+ return 0, ratio, 'bad'
277
+ elif score < t[2]:
278
+ ratio = (score - t[1]) / (t[2] - t[1])
279
+ return 1, ratio, 'good'
280
+ else:
281
+ ratio = (score - t[2]) / (1 - t[2])
282
+ return 2, ratio, 'vamos'
283
+
284
+ def score_level_names(self, scores):
285
+ """ Get multiple score level, return numpy array.
286
+ np.vectorize does not speed up loop, but it is convenient.
287
+ """
288
+ t = self.lvl_tresh
289
+ func_lvl_name = lambda x: 'bad' if x < t[1] else 'good' \
290
+ if x < t[2] else 'vamos'
291
+ lvl_names = np.vectorize(func_lvl_name)(scores)
292
+ return lvl_names
293
+
294
+ def set_level(self, thresh):
295
+ """ Set level thresh for bad, good, vamos.
296
+ """
297
+ from collections import namedtuple
298
+ Level = namedtuple('Level', ['zero', 'good', 'vamos'])
299
+ return Level(thresh[0], thresh[1], thresh[2])
300
+
301
+ def get_pallete(self):
302
+ PALLETE = EasyDict()
303
+
304
+ # light set
305
+ # PALLETE.bad = np.array([253, 138, 138])
306
+ # PALLETE.good = np.array([168, 209, 209])
307
+ # PALLETE.vamos = np.array([241, 247, 181])
308
+ # PALLETE.cube = np.array([158, 161, 212])
309
+
310
+ # dark set, set 80% brightness
311
+ PALLETE.bad = np.array([204, 111, 111])
312
+ PALLETE.good = np.array([143, 179, 179])
313
+ # PALLETE.vamos = np.array([196, 204, 124]) # overridden by the blue tone below
314
+ PALLETE.vamos = np.array([109, 169, 228])
315
+ PALLETE.cube = np.array([152, 155, 204])
316
+
317
+ PALLETE.left_arm = np.array([218, 119, 242])
318
+ PALLETE.right_arm = np.array([151, 117, 250])
319
+ PALLETE.left_leg = np.array([255, 212, 59])
320
+ PALLETE.right_leg = np.array([255, 169, 77])
321
+
322
+ PALLETE.head = np.array([134, 142, 150])
323
+ PALLETE.body = np.array([134, 142, 150])
324
+
325
+ # convert rgb to bgr
326
+ for k, v in PALLETE.items():
327
+ PALLETE[k] = v[::-1]
328
+ return PALLETE
329
+
330
+ if __name__ == '__main__':
331
+ vis = FastVisualizer()
332
+
333
+ image = '/github/Tennis.ai/assets/tempt_test.png'
334
+ vis.set_image(image)
335
+ np.random.seed(0)
336
+ keypoints = np.random.randint(300, 600, (17, 2))
337
+ from utils import Timer
338
+ t = Timer()
339
+ t.start()
340
+ vis.draw_score_bar(0.94)
341
+ # vis.draw_skeleton(keypoints)
342
+ # vis.draw_non_transparent_area((0, 0, 100, 100), alpha=0.2)
343
+ vis.draw_logo()
344
+ cv2.imshow('test', vis.image)
345
+ cv2.waitKey(0)
346
+ cv2.destroyAllWindows()
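The score bar above relies on `score_level`, which maps an OKS score into the bad / good / vamos bands defined by `set_level([0.3, 0.6, 0.8])` and returns how far into the band the score sits. A stand-alone sketch of the same mapping, using those default thresholds without any drawing:

```python
import numpy as np

def score_level(score, thresh=(0.3, 0.6, 0.8)):
    t = thresh
    if score < t[1]:
        # below the 'good' threshold; clamp because t[0] may be above 0
        ratio = np.clip((score - t[0]) / (t[1] - t[0]), 0, 1)
        return 0, float(ratio), 'bad'
    elif score < t[2]:
        return 1, (score - t[1]) / (t[2] - t[1]), 'good'
    else:
        return 2, (score - t[2]) / (1 - t[2]), 'vamos'

for s in (0.25, 0.55, 0.70, 0.94):
    print(s, score_level(s))   # e.g. 0.94 -> (2, ~0.7, 'vamos')
```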