Maksym-Lysyi committed
Commit e3641b1 · 1 Parent(s): 3964794

initial commit

This view is limited to 50 files because it contains too many changes.

Files changed (50):
  1. .dockerignore +8 -0
  2. .gitignore +5 -0
  3. Dockerfile +19 -0
  4. app.py +117 -0
  5. config.py +373 -0
  6. easy_ViTPose/__init__.py +5 -0
  7. easy_ViTPose/config.yaml +14 -0
  8. easy_ViTPose/configs/ViTPose_aic.py +20 -0
  9. easy_ViTPose/configs/ViTPose_ap10k.py +22 -0
  10. easy_ViTPose/configs/ViTPose_apt36k.py +22 -0
  11. easy_ViTPose/configs/ViTPose_coco.py +18 -0
  12. easy_ViTPose/configs/ViTPose_coco_25.py +20 -0
  13. easy_ViTPose/configs/ViTPose_common.py +195 -0
  14. easy_ViTPose/configs/ViTPose_mpii.py +18 -0
  15. easy_ViTPose/configs/ViTPose_wholebody.py +20 -0
  16. easy_ViTPose/configs/__init__.py +0 -0
  17. easy_ViTPose/datasets/COCO.py +556 -0
  18. easy_ViTPose/datasets/HumanPoseEstimation.py +17 -0
  19. easy_ViTPose/datasets/__init__.py +0 -0
  20. easy_ViTPose/easy_ViTPose.egg-info/PKG-INFO +4 -0
  21. easy_ViTPose/easy_ViTPose.egg-info/SOURCES.txt +35 -0
  22. easy_ViTPose/easy_ViTPose.egg-info/dependency_links.txt +1 -0
  23. easy_ViTPose/easy_ViTPose.egg-info/top_level.txt +2 -0
  24. easy_ViTPose/inference.py +334 -0
  25. easy_ViTPose/sort.py +266 -0
  26. easy_ViTPose/to_onnx.ipynb +0 -0
  27. easy_ViTPose/to_trt.ipynb +0 -0
  28. easy_ViTPose/train.py +174 -0
  29. easy_ViTPose/vit_models/__init__.py +8 -0
  30. easy_ViTPose/vit_models/backbone/__init__.py +0 -0
  31. easy_ViTPose/vit_models/backbone/vit.py +394 -0
  32. easy_ViTPose/vit_models/head/__init__.py +0 -0
  33. easy_ViTPose/vit_models/head/topdown_heatmap_base_head.py +120 -0
  34. easy_ViTPose/vit_models/head/topdown_heatmap_simple_head.py +334 -0
  35. easy_ViTPose/vit_models/losses/__init__.py +16 -0
  36. easy_ViTPose/vit_models/losses/classfication_loss.py +41 -0
  37. easy_ViTPose/vit_models/losses/heatmap_loss.py +83 -0
  38. easy_ViTPose/vit_models/losses/mesh_loss.py +402 -0
  39. easy_ViTPose/vit_models/losses/mse_loss.py +151 -0
  40. easy_ViTPose/vit_models/losses/multi_loss_factory.py +279 -0
  41. easy_ViTPose/vit_models/losses/regression_loss.py +444 -0
  42. easy_ViTPose/vit_models/model.py +24 -0
  43. easy_ViTPose/vit_models/optimizer.py +15 -0
  44. easy_ViTPose/vit_utils/__init__.py +6 -0
  45. easy_ViTPose/vit_utils/dist_util.py +212 -0
  46. easy_ViTPose/vit_utils/inference.py +93 -0
  47. easy_ViTPose/vit_utils/logging.py +133 -0
  48. easy_ViTPose/vit_utils/nms/__init__.py +0 -0
  49. easy_ViTPose/vit_utils/nms/cpu_nms.c +0 -0
  50. easy_ViTPose/vit_utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so +0 -0
.dockerignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__
+ pose_env_1
+ testing
+ vit_env
+ vit_test
+ test_vit_model.ipynb
+ models
+ models_2
.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__
+ pose_env_1
+ testing
+ vit_env
+ test_vit_model.ipynb
Dockerfile ADDED
@@ -0,0 +1,19 @@
+ FROM python:3.10
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+
+ RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+ RUN pip install --upgrade pip
+
+ # --no-cache-dir
+ RUN pip install -r requirements.txt
+
+ COPY . .
+
+ EXPOSE 7860
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+ ENV USE_NNPACK=0
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,117 @@
1
+ import gradio as gr
2
+ from main_func import video_identity
3
+
4
+ with gr.Blocks() as demo:
5
+
6
+ with gr.Row(variant='compact'):
7
+
8
+ with gr.Column():
9
+ gr.Markdown("#### Dynamic Time Warping:")
10
+
11
+ with gr.Row(variant='compact'):
12
+ dtw_mean = gr.Slider(
13
+ value=0.5,
14
+ minimum=0,
15
+ maximum=1.0,
16
+ step=0.05,
17
+ label="Winsorize Mean"
18
+ )
19
+
20
+ dtw_filter = gr.Slider(
21
+ value=3,
22
+ minimum=1,
23
+ maximum=20,
24
+ step=1,
25
+ label="Savitzky-Golay Filter"
26
+ )
27
+
28
+ gr.Markdown("#### Thresholds:")
29
+
30
+ with gr.Row(variant='compact'):
31
+ angles_sensitive = gr.Number(
32
+ value=15,
33
+ minimum=0,
34
+ maximum=75,
35
+ step=1,
36
+ min_width=100,
37
+ label="Sensitive"
38
+ )
39
+
40
+ angles_common = gr.Number(
41
+ value=25,
42
+ minimum=0,
43
+ maximum=75,
44
+ step=1,
45
+ min_width=100,
46
+ label="Standart"
47
+ )
48
+
49
+ angles_insensitive = gr.Number(
50
+ value=45,
51
+ minimum=0,
52
+ maximum=75,
53
+ step=1,
54
+ min_width=100,
55
+ label="Insensitive"
56
+ )
57
+
58
+ gr.Markdown("#### Patience:")
59
+
60
+ trigger_state = gr.Radio(value="three", choices=["three", "two"], label="Trigger Count")
61
+
62
+ input_teacher = gr.Video(show_share_button=False, show_download_button=False, sources=["upload"], label="Teacher's Video")
63
+ input_student = gr.Video(show_share_button=False, show_download_button=False, sources=["upload"], label="Student's Video")
64
+
65
+
66
+ with gr.Accordion("Clarifications:", open=True):
67
+ with gr.Accordion("Dynamic Time Warping:", open=False):
68
+ gr.Markdown("""
69
+ Dynamic Time Warping is an algorithm that aligns the teacher's and student's videos frame by frame, even when they are performed at different speeds.
70
+
71
+ - **Winsorized mean**: Determines the portion of DTW paths, sorted from best to worst, to use for generating the mean DTW alignment. Reasonable values range from 0.25 to 0.6.
72
+ - **Savitzky-Golay Filter**: Smooths the winsorized-mean DTW alignment, bringing it closer to a straight line. Reasonable values range from 2 to 10.
73
+ """)
74
+
75
+ with gr.Accordion("Thresholds:", open=False):
76
+ gr.Markdown("""
77
+ Thresholds are used to identify student errors in dance. If the difference in angle between the teacher's and student's videos exceeds this threshold, it is counted as an error.
78
+
79
+ - **Sensitive**: A threshold that is currently not used.
80
+ - **Standard**: A threshold for most angles. Reasonable values range from 20 to 40.
81
+ - **Insensitive**: A threshold for difficult areas, such as hands and toes. Reasonable values range from 35 to 55.
82
+ """)
83
+
84
+ with gr.Accordion("Patience:", open=False):
85
+ gr.Markdown("""
86
+ Patience suppresses spurious model detections by highlighting only errors that are detected in consecutive frames.
87
+
88
+ - **Three**: Utilizes 3 consecutive frames for error detection.
89
+ - **Two**: Utilizes 2 consecutive frames for error detection.
90
+
91
+ Both options can be used interchangeably.
92
+ """)
93
+
94
+
95
+
96
+ with gr.Row():
97
+ gr_button = gr.Button("Run Pose Comparison")
98
+
99
+ with gr.Row():
100
+ gr.HTML("<div style='height: 100px;'></div>")
101
+
102
+
103
+ with gr.Row():
104
+ output_merged = gr.Video(show_download_button=True)
105
+
106
+ with gr.Row():
107
+ general_log = gr.TextArea(lines=10, max_lines=9999, label="Error log")
108
+
109
+ gr_button.click(
110
+ fn=video_identity,
111
+ inputs=[dtw_mean, dtw_filter, angles_sensitive, angles_common, angles_insensitive, trigger_state, input_teacher, input_student],
112
+ outputs=[output_merged, general_log]
113
+ )
114
+
115
+
116
+ if __name__ == "__main__":
117
+ demo.launch()
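The accordion text above only describes the DTW controls informally. Below is a minimal, self-contained sketch of the idea as described: compute a DTW path per angle series, keep the best fraction of paths (the "Winsorize Mean" slider), average them into a single teacher-to-student frame mapping, and smooth that mapping with a Savitzky-Golay filter (the second slider). This is not the code in main_func (which is not part of this 50-file view); the function names and the per-series formulation are assumptions.

import numpy as np
from scipy.signal import savgol_filter


def dtw_path(a, b):
    """Classic dynamic-programming DTW for two 1-D series; returns (path, total cost)."""
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j - 1], cost[i - 1, j], cost[i, j - 1])
    path, i, j = [], n, m
    while i > 0 and j > 0:  # backtrack from the bottom-right corner
        path.append((i - 1, j - 1))
        step = int(np.argmin([cost[i - 1, j - 1], cost[i - 1, j], cost[i, j - 1]]))
        if step == 0:
            i, j = i - 1, j - 1
        elif step == 1:
            i -= 1
        else:
            j -= 1
    return path[::-1], cost[n, m]


def mean_alignment(teacher_series, student_series, winsorize_mean=0.5, savgol_filter_size=3):
    """Winsorized-mean DTW alignment: average the cheapest fraction of per-angle paths, then smooth."""
    results = sorted((dtw_path(t, s) for t, s in zip(teacher_series, student_series)),
                     key=lambda r: r[1])                      # best (cheapest) paths first
    kept = results[:max(1, int(len(results) * winsorize_mean))]

    n_teacher = len(teacher_series[0])
    mapped = np.zeros(n_teacher)
    counts = np.zeros(n_teacher)
    for path, _ in kept:                                      # average student index per teacher frame
        for i, j in path:
            mapped[i] += j
            counts[i] += 1
    mapped /= np.maximum(counts, 1)

    window = 2 * savgol_filter_size + 1                       # savgol_filter needs an odd window
    if len(mapped) > window:
        mapped = savgol_filter(mapped, window_length=window, polyorder=2)
    return mapped                                             # student frame index for each teacher frame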
config.py ADDED
@@ -0,0 +1,373 @@
1
+ CONNECTIONS_VIT_FULL = [
2
+ # head
3
+ (0, 2),
4
+ (0, 1),
5
+ (2, 4),
6
+ (1, 3),
7
+ (0, 6),
8
+ (0, 5),
9
+
10
+ # right arm
11
+ (6, 8),
12
+ (8, 10),
13
+
14
+ # right hand
15
+ (10, 112),
16
+
17
+ # thumb
18
+ (112, 113),
19
+ (113, 114),
20
+ (114, 115),
21
+ (115, 116),
22
+
23
+ # index finger
24
+ (112, 117),
25
+ (117, 118),
26
+ (118, 119),
27
+ (119, 120),
28
+
29
+ # middle finger
30
+ (112, 121),
31
+ (121, 122),
32
+ (122, 123),
33
+ (123, 124),
34
+
35
+ # ring finger
36
+ (112, 125),
37
+ (125, 126),
38
+ (126, 127),
39
+ (127, 128),
40
+
41
+ # pinky finger
42
+ (112, 129),
43
+ (129, 130),
44
+ (130, 131),
45
+ (131, 132),
46
+
47
+
48
+
49
+ # left arm
50
+ (5, 7),
51
+ (7, 9),
52
+
53
+ # left hand
54
+ (9, 91),
55
+
56
+
57
+ # thumb
58
+ (91, 92),
59
+ (92, 93),
60
+ (93, 94),
61
+ (94, 95),
62
+
63
+ # index finger
64
+ (91, 96),
65
+ (96, 97),
66
+ (97, 98),
67
+ (98, 99),
68
+
69
+ # middle finger
70
+ (91, 100),
71
+ (100, 101),
72
+ (101, 102),
73
+ (102, 103),
74
+
75
+ # ring finger
76
+ (91, 104),
77
+ (104, 105),
78
+ (105, 106),
79
+ (106, 107),
80
+
81
+ # pinky finger
82
+ (91, 108),
83
+ (108, 109),
84
+ (109, 110),
85
+ (110, 111),
86
+
87
+
88
+
89
+ # torso
90
+ (6, 5),
91
+ (12, 11),
92
+ (6, 12),
93
+ (5, 11),
94
+
95
+ # right leg
96
+ (12, 14),
97
+ (14, 16),
98
+
99
+ # right foot
100
+ (16, 22),
101
+ (22, 21),
102
+ (22, 20),
103
+
104
+
105
+ # left leg
106
+ (11, 13),
107
+ (13, 15),
108
+
109
+ # left foot
110
+ (15, 19),
111
+ (19, 18),
112
+ (19, 17),
113
+ ]
114
+
115
+ EDGE_GROUPS_FOR_ERRORS = [
116
+ [0, 2, 4],
117
+ [0, 1, 3],
118
+
119
+ # neck
120
+ [6, 0, 2],
121
+ [5, 0, 1],
122
+
123
+ # right arm
124
+
125
+ # right shoulder
126
+ [5, 6, 8],
127
+
128
+ # right elbow
129
+ [6, 8, 10],
130
+
131
+ # right hand
132
+ [8, 10, 121],
133
+
134
+ [112, 114, 116],
135
+ [112, 117, 120],
136
+ [112, 121, 124],
137
+ [112, 125, 128],
138
+ [112, 129, 132],
139
+
140
+ # left arm
141
+
142
+ # left shoulder
143
+ [6, 5, 7],
144
+
145
+ # left elbow
146
+ [5, 7, 9],
147
+
148
+ # left hand
149
+ [7, 9, 100],
150
+
151
+ [91, 93, 95],
152
+ [91, 96, 99],
153
+ [91, 100, 103],
154
+ [91, 104, 107],
155
+ [91, 108, 111],
156
+
157
+
158
+ # right leg
159
+
160
+ # right upper-leg
161
+ [6, 12, 14],
162
+
163
+ # right middle-leg
164
+ [12, 14, 16],
165
+
166
+ # right lower-leg
167
+ [14, 16, 22],
168
+ [16, 22, 21],
169
+ [16, 22, 20],
170
+
171
+ # left leg
172
+
173
+ # left upper-leg
174
+ [5, 11, 13],
175
+
176
+ # left middle-leg
177
+ [11, 13, 15],
178
+
179
+ # left lower-leg
180
+ [13, 15, 19],
181
+ [15, 19, 17],
182
+ [15, 19, 18],
183
+
184
+ ]
185
+
186
+
187
+
188
+ CONNECTIONS_FOR_ERROR = [
189
+ # head
190
+ (0, 2),
191
+ (2, 4),
192
+ (0, 1),
193
+ (1, 3),
194
+
195
+ # right arm
196
+ (6, 0),
197
+ (8, 6),
198
+ (10, 8),
199
+
200
+ # right hand
201
+ # (121, 10),
202
+
203
+ (112, 114),
204
+ (114, 116),
205
+
206
+ (112, 117),
207
+ (117, 120),
208
+
209
+ (112, 121),
210
+ (121, 124),
211
+
212
+ (112, 125),
213
+ (125, 128),
214
+
215
+ (112, 129),
216
+ (129, 132),
217
+
218
+ # left arm
219
+ (5, 0),
220
+ (7, 5),
221
+ (9, 7),
222
+
223
+ # left hand
224
+ # (100, 9),
225
+
226
+ (91, 93),
227
+ (93, 95),
228
+
229
+ (91, 96),
230
+ (96, 99),
231
+
232
+ (91, 100),
233
+ (100, 103),
234
+
235
+ (91, 104),
236
+ (104, 107),
237
+
238
+ (91, 108),
239
+ (108, 111),
240
+
241
+ # torso
242
+ (6, 12),
243
+ (5, 11),
244
+
245
+ # right leg
246
+ (12, 14),
247
+ (14, 16),
248
+
249
+ (16, 22),
250
+ (22, 21),
251
+ (22, 20),
252
+
253
+ # left leg
254
+ (11, 13),
255
+ (13, 15),
256
+
257
+ (15, 19),
258
+ (19, 17),
259
+ (19, 18),
260
+
261
+ ]
262
+
263
+ def get_thresholds(sensetive_error, general_error, unsensetive_error):
264
+ thresholds = [
265
+ general_error,
266
+ general_error,
267
+ general_error,
268
+ general_error,
269
+
270
+ general_error,
271
+ general_error,
272
+
273
+ unsensetive_error,
274
+ unsensetive_error,
275
+ unsensetive_error,
276
+ unsensetive_error,
277
+ unsensetive_error,
278
+ unsensetive_error,
279
+
280
+ general_error,
281
+ general_error,
282
+ unsensetive_error,
283
+ unsensetive_error,
284
+ unsensetive_error,
285
+ unsensetive_error,
286
+ unsensetive_error,
287
+ unsensetive_error,
288
+
289
+ general_error,
290
+ general_error,
291
+ unsensetive_error,
292
+ unsensetive_error,
293
+ unsensetive_error,
294
+
295
+ general_error,
296
+ general_error,
297
+ unsensetive_error,
298
+ unsensetive_error,
299
+ unsensetive_error,
300
+ ]
301
+
302
+ return thresholds
303
+
304
+
305
+ EDGE_GROUPS_FOR_SUMMARY = {
306
+ (2, 4): "Head position is incorrect",
307
+ (1, 3): "Head position is incorrect",
308
+
309
+ # neck
310
+
311
+ (0, 2): "Head position is incorrect",
312
+ (0, 1): "Head position is incorrect",
313
+
314
+ # right arm
315
+
316
+ # right shoulder
317
+ (6, 8): "Right shoulder position is incorrect",
318
+
319
+ # right elbow
320
+ (8, 10): "Right elbow position is incorrect",
321
+
322
+ # right hand
323
+ (10, 121): "Right hand's palm position is incorrect",
324
+
325
+ (114, 116): "Right thumb finger position is incorrect",
326
+ (117, 120): "Right index finger position is incorrect",
327
+ (121, 124): "Right middle finger position is incorrect",
328
+ (125, 128): "Right ring finger position is incorrect",
329
+ (129, 132): "Right pinky finger position is incorrect",
330
+
331
+ # left arm
332
+
333
+ # left shoulder
334
+ (5, 7): "Left shoulder position is incorrect",
335
+
336
+ # left elbow
337
+ (7, 9): "Left elbow position is incorrect",
338
+
339
+ # left hand
340
+ (9, 100): "Left hand palm position is incorrect",
341
+
342
+ (93, 95): "Left thumb finger position is incorrect",
343
+ (96, 99): "Left index finger position is incorrect",
344
+ (100, 103): "Left middle finger position is incorrect",
345
+ (104, 107): "Left ring finger position is incorrect",
346
+ (108, 111): "Left pinky finger position is incorrect",
347
+
348
+ # right leg
349
+
350
+ # right upper-leg
351
+ (12, 14): "Right thigh position is incorrect",
352
+
353
+ # right middle-leg
354
+ (14, 16): "Right shin position is incorrect",
355
+
356
+ # right lower-leg
357
+ (16, 22): "Right foot position is incorrect",
358
+ (22, 21): "Right shin position is incorrect",
359
+ (22, 20): "Right shin position is incorrect",
360
+
361
+ # left leg
362
+
363
+ # left upper-leg
364
+ (11, 13): "Left thigh position is incorrect",
365
+
366
+ # left middle-leg
367
+ (13, 15): "Left shin position is incorrect",
368
+
369
+ # left lower-leg
370
+ (15, 19): "Left foot position is incorrect",
371
+ (19, 17): "Left shin position is incorrect",
372
+ (19, 18): "Left shin position is incorrect"
373
+ }
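For reference, the 30 keypoint triples in EDGE_GROUPS_FOR_ERRORS and the 30 values returned by get_thresholds appear to line up one-to-one, and the "Patience" option in app.py keeps only errors seen in consecutive frames. The comparison code itself lives in main_func, which is not included in this view, so the helpers below (angle_at_joint, frame_errors, persistent_errors) are an illustrative sketch, not the repository's implementation.

import numpy as np


def angle_at_joint(p_a, p_b, p_c):
    """Angle in degrees at keypoint b, formed by the segments b->a and b->c."""
    v1 = np.asarray(p_a, dtype=float) - np.asarray(p_b, dtype=float)
    v2 = np.asarray(p_c, dtype=float) - np.asarray(p_b, dtype=float)
    cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-8)
    return float(np.degrees(np.arccos(np.clip(cos, -1.0, 1.0))))


def frame_errors(teacher_kpts, student_kpts, thresholds, edge_groups=EDGE_GROUPS_FOR_ERRORS):
    """Indices of angle groups whose teacher/student difference exceeds the matching threshold."""
    errors = []
    for idx, ((a, b, c), thr) in enumerate(zip(edge_groups, thresholds)):
        diff = abs(angle_at_joint(teacher_kpts[a], teacher_kpts[b], teacher_kpts[c])
                   - angle_at_joint(student_kpts[a], student_kpts[b], student_kpts[c]))
        if diff > thr:
            errors.append(idx)
    return errors


def persistent_errors(per_frame_errors, patience=3):
    """Keep only errors reported in `patience` consecutive frames ("Patience" in app.py)."""
    confirmed = []
    for t in range(patience - 1, len(per_frame_errors)):
        confirmed.append(set.intersection(*(set(per_frame_errors[t - k]) for k in range(patience))))
    return confirmed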
easy_ViTPose/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .inference import VitInference
+
+ __all__ = [
+     'VitInference'
+ ]
easy_ViTPose/config.yaml ADDED
@@ -0,0 +1,14 @@
+ # Train config ---------------------------------------
+ log_level: logging.INFO
+ seed: 0
+ deterministic: True
+ cudnn_benchmark: True # Use cudnn
+ resume_from: "ckpts/og-vitpose-s.pth" # CKPT path
+ # resume_from: False
+ gpu_ids: [0]
+ launcher: 'none' # When distributed training ['none', 'pytorch', 'slurm', 'mpi']
+ use_amp: True
+ validate: True
+ autoscale_lr: False
+ dist_params:
+ ...
easy_ViTPose/configs/ViTPose_aic.py ADDED
@@ -0,0 +1,20 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=14,
+     dataset_joints=14,
+     dataset_channel=[
+         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+     ],
+     inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_ap10k.py ADDED
@@ -0,0 +1,22 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=17,
+     dataset_joints=17,
+     dataset_channel=[
+         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+     ],
+     inference_channel=[
+         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+     ])
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_apt36k.py ADDED
@@ -0,0 +1,22 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=17,
+     dataset_joints=17,
+     dataset_channel=[
+         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+     ],
+     inference_channel=[
+         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+     ])
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_coco.py ADDED
@@ -0,0 +1,18 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=17,
+     dataset_joints=17,
+     dataset_channel=list(range(17)),
+     inference_channel=list(range(17)))
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_coco_25.py ADDED
@@ -0,0 +1,20 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=25,
+     dataset_joints=25,
+     dataset_channel=[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       16, 17, 18, 19, 20, 21, 22, 23, 24], ],
+     inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                        16, 17, 18, 19, 20, 21, 22, 23, 24])
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_common.py ADDED
@@ -0,0 +1,195 @@
1
+ # Common configuration
2
+ optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
3
+ constructor='LayerDecayOptimizerConstructor',
4
+ paramwise_cfg=dict(
5
+ num_layers=12,
6
+ layer_decay_rate=1 - 2e-4,
7
+ custom_keys={
8
+ 'bias': dict(decay_multi=0.),
9
+ 'pos_embed': dict(decay_mult=0.),
10
+ 'relative_position_bias_table': dict(decay_mult=0.),
11
+ 'norm': dict(decay_mult=0.)
12
+ }
13
+ )
14
+ )
15
+
16
+ optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
17
+
18
+ # learning policy
19
+ lr_config = dict(
20
+ policy='step',
21
+ warmup='linear',
22
+ warmup_iters=300,
23
+ warmup_ratio=0.001,
24
+ step=[3])
25
+
26
+ total_epochs = 4
27
+ target_type = 'GaussianHeatmap'
28
+
29
+ data_cfg = dict(
30
+ image_size=[192, 256],
31
+ heatmap_size=[48, 64],
32
+ soft_nms=False,
33
+ nms_thr=1.0,
34
+ oks_thr=0.9,
35
+ vis_thr=0.2,
36
+ use_gt_bbox=False,
37
+ det_bbox_thr=0.0,
38
+ bbox_file='data/coco/person_detection_results/'
39
+ 'COCO_val2017_detections_AP_H_56_person.json',
40
+ )
41
+
42
+ data_root = '/home/adryw/dataset/COCO17'
43
+ data = dict(
44
+ samples_per_gpu=64,
45
+ workers_per_gpu=6,
46
+ val_dataloader=dict(samples_per_gpu=128),
47
+ test_dataloader=dict(samples_per_gpu=128),
48
+ train=dict(
49
+ type='TopDownCocoDataset',
50
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
51
+ img_prefix=f'{data_root}/train2017/',
52
+ data_cfg=data_cfg),
53
+ val=dict(
54
+ type='TopDownCocoDataset',
55
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
56
+ img_prefix=f'{data_root}/val2017/',
57
+ data_cfg=data_cfg),
58
+ test=dict(
59
+ type='TopDownCocoDataset',
60
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
61
+ img_prefix=f'{data_root}/val2017/',
62
+ data_cfg=data_cfg)
63
+ )
64
+
65
+ model_small = dict(
66
+ type='TopDown',
67
+ pretrained=None,
68
+ backbone=dict(
69
+ type='ViT',
70
+ img_size=(256, 192),
71
+ patch_size=16,
72
+ embed_dim=384,
73
+ depth=12,
74
+ num_heads=12,
75
+ ratio=1,
76
+ use_checkpoint=False,
77
+ mlp_ratio=4,
78
+ qkv_bias=True,
79
+ drop_path_rate=0.1,
80
+ ),
81
+ keypoint_head=dict(
82
+ type='TopdownHeatmapSimpleHead',
83
+ in_channels=384,
84
+ num_deconv_layers=2,
85
+ num_deconv_filters=(256, 256),
86
+ num_deconv_kernels=(4, 4),
87
+ extra=dict(final_conv_kernel=1, ),
88
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=True,
92
+ post_process='default',
93
+ shift_heatmap=False,
94
+ target_type=target_type,
95
+ modulate_kernel=11,
96
+ use_udp=True))
97
+
98
+ model_base = dict(
99
+ type='TopDown',
100
+ pretrained=None,
101
+ backbone=dict(
102
+ type='ViT',
103
+ img_size=(256, 192),
104
+ patch_size=16,
105
+ embed_dim=768,
106
+ depth=12,
107
+ num_heads=12,
108
+ ratio=1,
109
+ use_checkpoint=False,
110
+ mlp_ratio=4,
111
+ qkv_bias=True,
112
+ drop_path_rate=0.3,
113
+ ),
114
+ keypoint_head=dict(
115
+ type='TopdownHeatmapSimpleHead',
116
+ in_channels=768,
117
+ num_deconv_layers=2,
118
+ num_deconv_filters=(256, 256),
119
+ num_deconv_kernels=(4, 4),
120
+ extra=dict(final_conv_kernel=1, ),
121
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
122
+ train_cfg=dict(),
123
+ test_cfg=dict(
124
+ flip_test=True,
125
+ post_process='default',
126
+ shift_heatmap=False,
127
+ target_type=target_type,
128
+ modulate_kernel=11,
129
+ use_udp=True))
130
+
131
+ model_large = dict(
132
+ type='TopDown',
133
+ pretrained=None,
134
+ backbone=dict(
135
+ type='ViT',
136
+ img_size=(256, 192),
137
+ patch_size=16,
138
+ embed_dim=1024,
139
+ depth=24,
140
+ num_heads=16,
141
+ ratio=1,
142
+ use_checkpoint=False,
143
+ mlp_ratio=4,
144
+ qkv_bias=True,
145
+ drop_path_rate=0.5,
146
+ ),
147
+ keypoint_head=dict(
148
+ type='TopdownHeatmapSimpleHead',
149
+ in_channels=1024,
150
+ num_deconv_layers=2,
151
+ num_deconv_filters=(256, 256),
152
+ num_deconv_kernels=(4, 4),
153
+ extra=dict(final_conv_kernel=1, ),
154
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
155
+ train_cfg=dict(),
156
+ test_cfg=dict(
157
+ flip_test=True,
158
+ post_process='default',
159
+ shift_heatmap=False,
160
+ target_type=target_type,
161
+ modulate_kernel=11,
162
+ use_udp=True))
163
+
164
+ model_huge = dict(
165
+ type='TopDown',
166
+ pretrained=None,
167
+ backbone=dict(
168
+ type='ViT',
169
+ img_size=(256, 192),
170
+ patch_size=16,
171
+ embed_dim=1280,
172
+ depth=32,
173
+ num_heads=16,
174
+ ratio=1,
175
+ use_checkpoint=False,
176
+ mlp_ratio=4,
177
+ qkv_bias=True,
178
+ drop_path_rate=0.55,
179
+ ),
180
+ keypoint_head=dict(
181
+ type='TopdownHeatmapSimpleHead',
182
+ in_channels=1280,
183
+ num_deconv_layers=2,
184
+ num_deconv_filters=(256, 256),
185
+ num_deconv_kernels=(4, 4),
186
+ extra=dict(final_conv_kernel=1, ),
187
+ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
188
+ train_cfg=dict(),
189
+ test_cfg=dict(
190
+ flip_test=True,
191
+ post_process='default',
192
+ shift_heatmap=False,
193
+ target_type=target_type,
194
+ modulate_kernel=11,
195
+ use_udp=True))
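Each dataset-specific config below star-imports this module and then patches model_small / model_base / model_large / model_huge in place, so a caller can pick an architecture dict by dataset and size. The helper dyn_model_import used by inference.py is not shown in this 50-file view; the snippet below is only an assumed equivalent, included to illustrate the mechanism.

import importlib


def load_model_cfg(dataset, size):
    """Return e.g. model_small from easy_ViTPose.configs.ViTPose_coco for ('coco', 'small')."""
    module = importlib.import_module(f'easy_ViTPose.configs.ViTPose_{dataset}')
    return getattr(module, f'model_{size}')


# Example: the wholebody config overrides the keypoint head to 133 output channels.
# load_model_cfg('wholebody', 'small')['keypoint_head']['out_channels']  # -> 133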
easy_ViTPose/configs/ViTPose_mpii.py ADDED
@@ -0,0 +1,18 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=16,
+     dataset_joints=16,
+     dataset_channel=list(range(16)),
+     inference_channel=list(range(16)))
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/ViTPose_wholebody.py ADDED
@@ -0,0 +1,20 @@
+ from .ViTPose_common import *
+
+ # Channel configuration
+ channel_cfg = dict(
+     num_output_channels=133,
+     dataset_joints=133,
+     dataset_channel=[
+         list(range(133)),
+     ],
+     inference_channel=list(range(133)))
+
+ # Set models channels
+ data_cfg['num_output_channels'] = channel_cfg['num_output_channels']
+ data_cfg['num_joints'] = channel_cfg['dataset_joints']
+ data_cfg['dataset_channel'] = channel_cfg['dataset_channel']
+ data_cfg['inference_channel'] = channel_cfg['inference_channel']
+
+ names = ['small', 'base', 'large', 'huge']
+ for name in names:
+     globals()[f'model_{name}']['keypoint_head']['out_channels'] = channel_cfg['num_output_channels']
easy_ViTPose/configs/__init__.py ADDED
File without changes
easy_ViTPose/datasets/COCO.py ADDED
@@ -0,0 +1,556 @@
1
+ # Part of this code is derived/taken from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
2
+ import os
3
+ import sys
4
+ import pickle
5
+ import random
6
+
7
+ import cv2
8
+ import json_tricks as json
9
+ import numpy as np
10
+ from pycocotools.coco import COCO
11
+ from torchvision import transforms
12
+ import torchvision.transforms.functional as F
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+
16
+ from .HumanPoseEstimation import HumanPoseEstimationDataset as Dataset
17
+
18
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
19
+ from vit_utils.transform import fliplr_joints, affine_transform, get_affine_transform
20
+
21
+ import numpy as np
22
+
23
+
24
+ class COCODataset(Dataset):
25
+ """
26
+ COCODataset class.
27
+ """
28
+
29
+ def __init__(self, root_path="./datasets/COCO", data_version="train2017",
30
+ is_train=True, use_gt_bboxes=True, bbox_path="",
31
+ image_width=288, image_height=384,
32
+ scale=True, scale_factor=0.35, flip_prob=0.5, rotate_prob=0.5, rotation_factor=45., half_body_prob=0.3,
33
+ use_different_joints_weight=False, heatmap_sigma=3, soft_nms=False):
34
+ """
35
+ Initializes a new COCODataset object.
36
+
37
+ Image and annotation indexes are loaded and stored in memory.
38
+ Annotations are preprocessed to have a simple list of annotations to iterate over.
39
+
40
+ Bounding boxes can be loaded from the ground truth or from a pickle file (in this case, no annotations are
41
+ provided).
42
+
43
+ Args:
44
+ root_path (str): dataset root path.
45
+ Default: "./datasets/COCO"
46
+ data_version (str): desired version/folder of COCO. Possible options are "train2017", "val2017".
47
+ Default: "train2017"
48
+ is_train (bool): train or eval mode. If true, train mode is used.
49
+ Default: True
50
+ use_gt_bboxes (bool): use ground truth bounding boxes. If False, bbox_path is required.
51
+ Default: True
52
+ bbox_path (str): bounding boxes pickle file path.
53
+ Default: ""
54
+ image_width (int): image width.
55
+ Default: 288
56
+ image_height (int): image height.
57
+ Default: ``384``
58
+ color_rgb (bool): rgb or bgr color mode. If True, rgb color mode is used.
59
+ Default: True
60
+ scale (bool): scale mode.
61
+ Default: True
62
+ scale_factor (float): scale factor.
63
+ Default: 0.35
64
+ flip_prob (float): flip probability.
65
+ Default: 0.5
66
+ rotate_prob (float): rotate probability.
67
+ Default: 0.5
68
+ rotation_factor (float): rotation factor.
69
+ Default: 45.
70
+ half_body_prob (float): half body probability.
71
+ Default: 0.3
72
+ use_different_joints_weight (bool): use different joints weights.
73
+ If true, the following joints weights will be used:
74
+ [1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, 1.5]
75
+ Default: False
76
+ heatmap_sigma (float): sigma of the gaussian used to create the heatmap.
77
+ Default: 3
78
+ soft_nms (bool): enable soft non-maximum suppression.
79
+ Default: False
80
+ """
81
+ super(COCODataset, self).__init__()
82
+
83
+ self.root_path = root_path
84
+ self.data_version = data_version
85
+ self.is_train = is_train
86
+ self.use_gt_bboxes = use_gt_bboxes
87
+ self.bbox_path = bbox_path
88
+ self.scale = scale # ToDo Check
89
+ self.scale_factor = scale_factor
90
+ self.flip_prob = flip_prob
91
+ self.rotate_prob = rotate_prob
92
+ self.rotation_factor = rotation_factor
93
+ self.half_body_prob = half_body_prob
94
+ self.use_different_joints_weight = use_different_joints_weight # ToDo Check
95
+ self.heatmap_sigma = heatmap_sigma
96
+ self.soft_nms = soft_nms
97
+
98
+ # Image & annotation path
99
+ self.data_path = f"{root_path}/{data_version}"
100
+ self.annotation_path = f"{root_path}/annotations/person_keypoints_{data_version}.json"
101
+
102
+ self.image_size = (image_width, image_height)
103
+ self.aspect_ratio = image_width * 1.0 / image_height
104
+
105
+ self.heatmap_size = (int(image_width / 4), int(image_height / 4))
106
+ self.heatmap_type = 'gaussian'
107
+ self.pixel_std = 200 # I don't understand the meaning of pixel_std (=200) in the original implementation
108
+
109
+ self.num_joints = 25
110
+ self.num_joints_half_body = 15
111
+
112
+ # eye, ear, shoulder, elbow, wrist, hip, knee, ankle
113
+ self.flip_pairs = [[1, 2], [3, 4], [6, 7], [8, 9], [10, 11], [12, 13],
114
+ [15, 16], [17, 18], [19, 22], [20, 23], [21, 24]]
115
+ self.upper_body_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
116
+ self.lower_body_ids = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
117
+ self.joints_weight = np.array([1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2,
118
+ 1.5, 1.5, 1., 1., 1., 1.2, 1.2, 1.5, 1.5,
119
+ 1.5, 1.5, 1.5, 1.5, 1.5,
120
+ 1.5]).reshape((self.num_joints, 1))
121
+
122
+ self.transform = transforms.Compose([
123
+ transforms.ToTensor(),
124
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
125
+ ])
126
+
127
+ # Load COCO dataset - Create COCO object then load images and annotations
128
+ self.coco = COCO(self.annotation_path)
129
+
130
+ # Create a list of annotations and the corresponding image (each image can contain more than one detection)
131
+
132
+ """ Load bboxes and joints
133
+ - if self.use_gt_bboxes -> Load GT bboxes and joints
134
+ - else -> Load pre-predicted bboxes by a detector (as YOLOv3) and null joints
135
+ """
136
+
137
+ if not self.use_gt_bboxes:
138
+ """
139
+ bboxes must be saved as the original COCO annotations
140
+ i.e. the format must be:
141
+ bboxes = {
142
+ '<imgId>': [
143
+ {
144
+ 'id': <annId>, # progressive id for debugging
145
+ 'clean_bbox': np.array([<x>, <y>, <w>, <h>])}
146
+ },
147
+ ...
148
+ ],
149
+ ...
150
+ }
151
+ """
152
+ with open(self.bbox_path, 'rb') as fd:
153
+ bboxes = pickle.load(fd)
154
+
155
+ self.data = []
156
+ # load annotations for each image of COCO
157
+ for imgId in tqdm(self.coco.getImgIds(), desc="Prepare images, annotations ... "):
158
+ ann_ids = self.coco.getAnnIds(imgIds=imgId, iscrowd=False) # annotation ids
159
+ img = self.coco.loadImgs(imgId)[0] # load img
160
+
161
+ if self.use_gt_bboxes:
162
+ objs = self.coco.loadAnns(ann_ids)
163
+
164
+ # sanitize bboxes
165
+ valid_objs = []
166
+ for obj in objs:
167
+ # Skip non-person objects (it should never happen)
168
+ if obj['category_id'] != 1:
169
+ continue
170
+
171
+ # ignore objs without keypoints annotation
172
+ if max(obj['keypoints']) == 0 and max(obj['foot_kpts']) == 0:
173
+ continue
174
+
175
+ x, y, w, h = obj['bbox']
176
+ x1 = np.max((0, x))
177
+ y1 = np.max((0, y))
178
+ x2 = np.min((img['width'] - 1, x1 + np.max((0, w - 1))))
179
+ y2 = np.min((img['height'] - 1, y1 + np.max((0, h - 1))))
180
+
181
+ # Use only valid bounding boxes
182
+ if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
183
+ obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
184
+ valid_objs.append(obj)
185
+
186
+ objs = valid_objs
187
+
188
+ else:
189
+ objs = bboxes[imgId]
190
+
191
+ # for each annotation of this image, add the formatted annotation to self.data
192
+ for obj in objs:
193
+ joints = np.zeros((self.num_joints, 2), dtype=np.float64)
194
+ joints_visibility = np.ones((self.num_joints, 2), dtype=np.float64)
195
+
196
+ # Add foot data to keypoints
197
+ obj['keypoints'].extend(obj['foot_kpts'])
198
+
199
+ if self.use_gt_bboxes:
200
+ """ COCO pre-processing
201
+
202
+ - Moved above
203
+ - Skip non-person objects (it should never happen)
204
+ if obj['category_id'] != 1:
205
+ continue
206
+
207
+ # ignore objs without keypoints annotation
208
+ if max(obj['keypoints']) == 0:
209
+ continue
210
+ """
211
+
212
+ # Not all joints are already present, skip them
213
+ vjoints = list(range(self.num_joints))
214
+ vjoints.remove(5)
215
+ vjoints.remove(14)
216
+
217
+ for idx, pt in enumerate(vjoints):
218
+ if pt == 5 or pt == 14:
219
+ continue # Neck and hip are manually filled
220
+ joints[pt, 0] = obj['keypoints'][idx * 3 + 0]
221
+ joints[pt, 1] = obj['keypoints'][idx * 3 + 1]
222
+ t_vis = int(np.clip(obj['keypoints'][idx * 3 + 2], 0, 1))
223
+ """
224
+ - COCO:
225
+ if visibility == 0 -> keypoint is not in the image.
226
+ if visibility == 1 -> keypoint is in the image BUT not visible
227
+ (e.g. behind an object).
228
+ if visibility == 2 -> keypoint looks clearly
229
+ (i.e. it is not hidden).
230
+ """
231
+ joints_visibility[pt, 0] = t_vis
232
+ joints_visibility[pt, 1] = t_vis
233
+
234
+ center, scale = self._box2cs(obj['clean_bbox'][:4])
235
+
236
+ # Add neck and c-hip (check utils/visualization.py for keypoints)
237
+ joints[5, 0] = (joints[6, 0] + joints[7, 0]) / 2
238
+ joints[5, 1] = (joints[6, 1] + joints[7, 1]) / 2
239
+ joints_visibility[5, :] = min(joints_visibility[6, 0],
240
+ joints_visibility[7, 0])
241
+ joints[14, 0] = (joints[12, 0] + joints[13, 0]) / 2
242
+ joints[14, 1] = (joints[12, 1] + joints[13, 1]) / 2
243
+ joints_visibility[14, :] = min(joints_visibility[12, 0],
244
+ joints_visibility[13, 0])
245
+
246
+ self.data.append({
247
+ 'imgId': imgId,
248
+ 'annId': obj['id'],
249
+ 'imgPath': f"{self.root_path}/{self.data_version}/{imgId:012d}.jpg",
250
+ 'center': center,
251
+ 'scale': scale,
252
+ 'joints': joints,
253
+ 'joints_visibility': joints_visibility,
254
+ })
255
+
256
+ # Done check if we need prepare_data -> We should not
257
+ print('\nCOCO dataset loaded!')
258
+
259
+ # Default values
260
+ self.bbox_thre = 1.0
261
+ self.image_thre = 0.0
262
+ self.in_vis_thre = 0.2
263
+ self.nms_thre = 1.0
264
+ self.oks_thre = 0.9
265
+
266
+ def __len__(self):
267
+ return len(self.data)
268
+
269
+ def __getitem__(self, index):
270
+ # index = 0
271
+ joints_data = self.data[index].copy()
272
+
273
+ # Load image
274
+ try:
275
+ image = np.array(Image.open(joints_data['imgPath']))
276
+ if image.ndim == 2:
277
+ # Some images are grayscale and will fail the transform, convert to RGB
278
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
279
+ except:
280
+ raise ValueError(f"Fail to read {joints_data['imgPath']}")
281
+
282
+ joints = joints_data['joints']
283
+ joints_vis = joints_data['joints_visibility']
284
+
285
+ c = joints_data['center']
286
+ s = joints_data['scale']
287
+ score = joints_data['score'] if 'score' in joints_data else 1
288
+ r = 0
289
+
290
+ # Apply data augmentation
291
+ if self.is_train:
292
+ if (self.half_body_prob and random.random() < self.half_body_prob and
293
+ np.sum(joints_vis[:, 0]) > self.num_joints_half_body):
294
+ c_half_body, s_half_body = self._half_body_transform(joints, joints_vis)
295
+
296
+ if c_half_body is not None and s_half_body is not None:
297
+ c, s = c_half_body, s_half_body
298
+
299
+ sf = self.scale_factor
300
+ rf = self.rotation_factor
301
+
302
+ if self.scale:
303
+ # A random scale factor in [1 - sf, 1 + sf]
304
+ s = s * np.clip(random.random() * sf + 1, 1 - sf, 1 + sf)
305
+
306
+ if self.rotate_prob and random.random() < self.rotate_prob:
307
+ # A random rotation factor in [-2 * rf, 2 * rf]
308
+ r = np.clip(random.random() * rf, -rf * 2, rf * 2)
309
+ else:
310
+ r = 0
311
+
312
+ if self.flip_prob and random.random() < self.flip_prob:
313
+ image = image[:, ::-1, :]
314
+ joints, joints_vis = fliplr_joints(joints, joints_vis,
315
+ image.shape[1],
316
+ self.flip_pairs)
317
+ c[0] = image.shape[1] - c[0] - 1
318
+
319
+ # Apply affine transform on joints and image
320
+ trans = get_affine_transform(c, s, self.pixel_std, r, self.image_size)
321
+ image = cv2.warpAffine(
322
+ image,
323
+ trans,
324
+ (int(self.image_size[0]), int(self.image_size[1])),
325
+ flags=cv2.INTER_LINEAR
326
+ )
327
+
328
+ for i in range(self.num_joints):
329
+ if joints_vis[i, 0] > 0.:
330
+ joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
331
+
332
+ # Convert image to tensor and normalize
333
+ if self.transform is not None: # I could remove this check
334
+ image = self.transform(image)
335
+
336
+ target, target_weight = self._generate_target(joints, joints_vis)
337
+
338
+ # Update metadata
339
+ joints_data['joints'] = joints
340
+ joints_data['joints_visibility'] = joints_vis
341
+ joints_data['center'] = c
342
+ joints_data['scale'] = s
343
+ joints_data['rotation'] = r
344
+ joints_data['score'] = score
345
+
346
+ # from utils.visualization import draw_points_and_skeleton, joints_dict
347
+ # image = np.rollaxis(image.detach().cpu().numpy(), 0, 3)
348
+ # joints = np.hstack((joints[:, ::-1], joints_vis[:, 0][..., None]))
349
+ # image = draw_points_and_skeleton(image.copy(), joints,
350
+ # joints_dict()['coco']['skeleton'],
351
+ # person_index=0,
352
+ # points_color_palette='gist_rainbow',
353
+ # skeleton_color_palette='jet',
354
+ # points_palette_samples=10,
355
+ # confidence_threshold=0.4)
356
+ # cv2.imshow('', image)
357
+ # cv2.waitKey(0)
358
+
359
+ return image, target.astype(np.float32), target_weight.astype(np.float32), joints_data
360
+
361
+
362
+ # Private methods
363
+ def _box2cs(self, box):
364
+ x, y, w, h = box[:4]
365
+ return self._xywh2cs(x, y, w, h)
366
+
367
+ def _xywh2cs(self, x, y, w, h):
368
+ center = np.zeros((2,), dtype=np.float32)
369
+ center[0] = x + w * 0.5
370
+ center[1] = y + h * 0.5
371
+
372
+ if w > self.aspect_ratio * h:
373
+ h = w * 1.0 / self.aspect_ratio
374
+ elif w < self.aspect_ratio * h:
375
+ w = h * self.aspect_ratio
376
+ scale = np.array(
377
+ [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
378
+ dtype=np.float32)
379
+ if center[0] != -1:
380
+ scale = scale * 1.25
381
+
382
+ return center, scale
383
+
384
+ def _half_body_transform(self, joints, joints_vis):
385
+ upper_joints = []
386
+ lower_joints = []
387
+ for joint_id in range(self.num_joints):
388
+ if joints_vis[joint_id][0] > 0:
389
+ if joint_id in self.upper_body_ids:
390
+ upper_joints.append(joints[joint_id])
391
+ else:
392
+ lower_joints.append(joints[joint_id])
393
+
394
+ if random.random() < 0.5 and len(upper_joints) > 2:
395
+ selected_joints = upper_joints
396
+ else:
397
+ selected_joints = lower_joints \
398
+ if len(lower_joints) > 2 else upper_joints
399
+
400
+ if len(selected_joints) < 2:
401
+ return None, None
402
+
403
+ selected_joints = np.array(selected_joints, dtype=np.float32)
404
+ center = selected_joints.mean(axis=0)[:2]
405
+
406
+ left_top = np.amin(selected_joints, axis=0)
407
+ right_bottom = np.amax(selected_joints, axis=0)
408
+
409
+ w = right_bottom[0] - left_top[0]
410
+ h = right_bottom[1] - left_top[1]
411
+
412
+ if w > self.aspect_ratio * h:
413
+ h = w * 1.0 / self.aspect_ratio
414
+ elif w < self.aspect_ratio * h:
415
+ w = h * self.aspect_ratio
416
+
417
+ scale = np.array(
418
+ [
419
+ w * 1.0 / self.pixel_std,
420
+ h * 1.0 / self.pixel_std
421
+ ],
422
+ dtype=np.float32
423
+ )
424
+
425
+ scale = scale * 1.5
426
+
427
+ return center, scale
428
+
429
+ def _generate_target(self, joints, joints_vis):
430
+ """
431
+ :param joints: [num_joints, 2]
432
+ :param joints_vis: [num_joints, 2]
433
+ :return: target, target_weight(1: visible, 0: invisible)
434
+ """
435
+ target_weight = np.ones((self.num_joints, 1), dtype=np.float32)
436
+ target_weight[:, 0] = joints_vis[:, 0]
437
+
438
+ if self.heatmap_type == 'gaussian':
439
+ target = np.zeros((self.num_joints,
440
+ self.heatmap_size[1],
441
+ self.heatmap_size[0]),
442
+ dtype=np.float32)
443
+
444
+ tmp_size = self.heatmap_sigma * 3
445
+
446
+ for joint_id in range(self.num_joints):
447
+ feat_stride = np.asarray(self.image_size) / np.asarray(self.heatmap_size)
448
+ mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
449
+ mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
450
+ # Check that any part of the gaussian is in-bounds
451
+ ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
452
+ br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
453
+ if ul[0] >= self.heatmap_size[0] or ul[1] >= self.heatmap_size[1] \
454
+ or br[0] < 0 or br[1] < 0:
455
+ # If not, just return the image as is
456
+ target_weight[joint_id] = 0
457
+ continue
458
+
459
+ # # Generate gaussian
460
+ size = 2 * tmp_size + 1
461
+ x = np.arange(0, size, 1, np.float32)
462
+ y = x[:, np.newaxis]
463
+ x0 = y0 = size // 2
464
+ # The gaussian is not normalized, we want the center value to equal 1
465
+ g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * self.heatmap_sigma ** 2))
466
+
467
+ # Usable gaussian range
468
+ g_x = max(0, -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0]
469
+ g_y = max(0, -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1]
470
+ # Image range
471
+ img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0])
472
+ img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1])
473
+
474
+ v = target_weight[joint_id]
475
+ if v > 0.5:
476
+ target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
477
+ g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
478
+ else:
479
+ raise NotImplementedError
480
+
481
+ if self.use_different_joints_weight:
482
+ target_weight = np.multiply(target_weight, self.joints_weight)
483
+
484
+ return target, target_weight
485
+
486
+ def _write_coco_keypoint_results(self, keypoints, res_file):
487
+ data_pack = [
488
+ {
489
+ 'cat_id': 1, # 1 == 'person'
490
+ 'cls': 'person',
491
+ 'ann_type': 'keypoints',
492
+ 'keypoints': keypoints
493
+ }
494
+ ]
495
+
496
+ results = self._coco_keypoint_results_one_category_kernel(data_pack[0])
497
+ with open(res_file, 'w') as f:
498
+ json.dump(results, f, sort_keys=True, indent=4)
499
+ try:
500
+ json.load(open(res_file))
501
+ except Exception:
502
+ content = []
503
+ with open(res_file, 'r') as f:
504
+ for line in f:
505
+ content.append(line)
506
+ content[-1] = ']'
507
+ with open(res_file, 'w') as f:
508
+ for c in content:
509
+ f.write(c)
510
+
511
+ def _coco_keypoint_results_one_category_kernel(self, data_pack):
512
+ cat_id = data_pack['cat_id']
513
+ keypoints = data_pack['keypoints']
514
+ cat_results = []
515
+
516
+ for img_kpts in keypoints:
517
+ if len(img_kpts) == 0:
518
+ continue
519
+
520
+ _key_points = np.array([img_kpts[k]['keypoints'] for k in range(len(img_kpts))], dtype=np.float32)
521
+ key_points = np.zeros((_key_points.shape[0], self.num_joints * 3), dtype=np.float32)
522
+
523
+ for ipt in range(self.num_joints):
524
+ key_points[:, ipt * 3 + 0] = _key_points[:, ipt, 0]
525
+ key_points[:, ipt * 3 + 1] = _key_points[:, ipt, 1]
526
+ key_points[:, ipt * 3 + 2] = _key_points[:, ipt, 2] # keypoints score.
527
+
528
+ result = [
529
+ {
530
+ 'image_id': img_kpts[k]['image'],
531
+ 'category_id': cat_id,
532
+ 'keypoints': list(key_points[k]),
533
+ 'score': img_kpts[k]['score'].astype(np.float32),
534
+ 'center': list(img_kpts[k]['center']),
535
+ 'scale': list(img_kpts[k]['scale'])
536
+ }
537
+ for k in range(len(img_kpts))
538
+ ]
539
+ cat_results.extend(result)
540
+
541
+ return cat_results
542
+
543
+
544
+ if __name__ == '__main__':
545
+ # from skimage import io
546
+ coco = COCODataset(root_path=f"{os.path.dirname(__file__)}/COCO", data_version="traincoex", rotate_prob=0., half_body_prob=0.)
547
+ item = coco[1]
548
+ # io.imsave("tmp.jpg", item[0].permute(1,2,0).numpy())
549
+ print()
550
+ print(item[1].shape)
551
+ print('ok!!')
552
+ # img = np.clip(np.transpose(item[0].numpy(), (1, 2, 0))[:, :, ::-1] * np.asarray([0.229, 0.224, 0.225]) +
553
+ # np.asarray([0.485, 0.456, 0.406]), 0, 1) * 255
554
+ # cv2.imwrite('./tmp.png', img.astype(np.uint8))
555
+ # print(item[-1])
556
+ pass
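The target construction in _generate_target above stamps an unnormalized Gaussian (peak value exactly 1) onto a quarter-resolution heatmap at each visible joint, and zeroes the weight of joints whose Gaussian falls entirely outside the map. The following is a simplified single-joint sketch of that idea, not a drop-in replacement for the method above.

import numpy as np


def gaussian_heatmap(joint_xy, heatmap_size=(72, 96), image_size=(288, 384), sigma=3):
    """Return (heatmap, weight) for one joint given in image coordinates (x, y)."""
    w, h = heatmap_size
    stride = np.asarray(image_size, dtype=np.float32) / np.asarray(heatmap_size, dtype=np.float32)
    mu_x = int(joint_xy[0] / stride[0] + 0.5)        # joint centre in heatmap coordinates
    mu_y = int(joint_xy[1] / stride[1] + 0.5)

    tmp = 3 * sigma                                   # the Gaussian is effectively zero beyond 3*sigma
    if mu_x + tmp < 0 or mu_y + tmp < 0 or mu_x - tmp >= w or mu_y - tmp >= h:
        return np.zeros((h, w), dtype=np.float32), 0.0    # completely out of bounds -> ignore joint

    xs = np.arange(w, dtype=np.float32)
    ys = np.arange(h, dtype=np.float32)[:, None]
    # Unnormalized Gaussian: the value at (mu_x, mu_y) is exactly 1, as in _generate_target.
    target = np.exp(-((xs - mu_x) ** 2 + (ys - mu_y) ** 2) / (2 * sigma ** 2)).astype(np.float32)
    return target, 1.0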
easy_ViTPose/datasets/HumanPoseEstimation.py ADDED
@@ -0,0 +1,17 @@
+ from torch.utils.data import Dataset
+
+
+ class HumanPoseEstimationDataset(Dataset):
+     """
+     HumanPoseEstimationDataset class.
+
+     Generic class for HPE datasets.
+     """
+     def __init__(self):
+         pass
+
+     def __len__(self):
+         pass
+
+     def __getitem__(self, item):
+         pass
easy_ViTPose/datasets/__init__.py ADDED
File without changes
easy_ViTPose/easy_ViTPose.egg-info/PKG-INFO ADDED
@@ -0,0 +1,4 @@
+ Metadata-Version: 2.1
+ Name: easy-ViTPose
+ Version: 0.1
+ License-File: LICENSE
easy_ViTPose/easy_ViTPose.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,35 @@
+ LICENSE
+ README.md
+ setup.py
+ src/easy_ViTPose.egg-info/PKG-INFO
+ src/easy_ViTPose.egg-info/SOURCES.txt
+ src/easy_ViTPose.egg-info/dependency_links.txt
+ src/easy_ViTPose.egg-info/top_level.txt
+ src/vit_models/__init__.py
+ src/vit_models/model.py
+ src/vit_models/optimizer.py
+ src/vit_models/losses/__init__.py
+ src/vit_models/losses/classfication_loss.py
+ src/vit_models/losses/heatmap_loss.py
+ src/vit_models/losses/mesh_loss.py
+ src/vit_models/losses/mse_loss.py
+ src/vit_models/losses/multi_loss_factory.py
+ src/vit_models/losses/regression_loss.py
+ src/vit_utils/__init__.py
+ src/vit_utils/dist_util.py
+ src/vit_utils/inference.py
+ src/vit_utils/logging.py
+ src/vit_utils/top_down_eval.py
+ src/vit_utils/train_valid_fn.py
+ src/vit_utils/transform.py
+ src/vit_utils/util.py
+ src/vit_utils/visualization.py
+ src/vit_utils/nms/__init__.py
+ src/vit_utils/nms/nms.py
+ src/vit_utils/nms/nms_ori.py
+ src/vit_utils/nms/setup_linux.py
+ src/vit_utils/post_processing/__init__.py
+ src/vit_utils/post_processing/group.py
+ src/vit_utils/post_processing/nms.py
+ src/vit_utils/post_processing/one_euro_filter.py
+ src/vit_utils/post_processing/post_transforms.py
easy_ViTPose/easy_ViTPose.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
easy_ViTPose/easy_ViTPose.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ vit_models
+ vit_utils
easy_ViTPose/inference.py ADDED
@@ -0,0 +1,334 @@
1
+ import abc
2
+ import os
3
+ from typing import Optional
4
+ import typing
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch
9
+
10
+ from ultralytics import YOLO
11
+
12
+ from .configs.ViTPose_common import data_cfg
13
+ from .sort import Sort
14
+ from .vit_models.model import ViTPose
15
+ from .vit_utils.inference import draw_bboxes, pad_image
16
+ from .vit_utils.top_down_eval import keypoints_from_heatmaps
17
+ from .vit_utils.util import dyn_model_import, infer_dataset_by_path
18
+ from .vit_utils.visualization import draw_points_and_skeleton, joints_dict
19
+
20
+ try:
21
+ import torch_tensorrt
22
+ except ModuleNotFoundError:
23
+ pass
24
+
25
+ try:
26
+ import onnxruntime
27
+ except ModuleNotFoundError:
28
+ pass
29
+
30
+ __all__ = ['VitInference']
31
+ np.bool = np.bool_
32
+ MEAN = [0.485, 0.456, 0.406]
33
+ STD = [0.229, 0.224, 0.225]
34
+
35
+
36
+ DETC_TO_YOLO_YOLOC = {
37
+ 'human': [0],
38
+ 'cat': [15],
39
+ 'dog': [16],
40
+ 'horse': [17],
41
+ 'sheep': [18],
42
+ 'cow': [19],
43
+ 'elephant': [20],
44
+ 'bear': [21],
45
+ 'zebra': [22],
46
+ 'giraffe': [23],
47
+ 'animals': [15, 16, 17, 18, 19, 20, 21, 22, 23]
48
+ }
49
+
50
+
51
+ class VitInference:
52
+ """
53
+ Class for performing inference using ViTPose models with YOLOv8 human detection and SORT tracking.
54
+
55
+ Args:
56
+ model (str): Path to the ViT model file (.pth, .onnx, .engine).
57
+ yolo (str): Path of the YOLOv8 model to load.
58
+ model_name (str, optional): Name of the ViT model architecture to use.
59
+ Valid values are 's', 'b', 'l', 'h'.
60
+ Defaults to None; required when loading .pth checkpoints.
61
+ det_class (str, optional): the detection class. if None it is inferred by the dataset.
62
+ valid values are 'human', 'cat', 'dog', 'horse', 'sheep',
63
+ 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
64
+ 'animals' (which is all previous but human)
65
+ dataset (str, optional): Name of the dataset. If None it's extracted from the file name.
66
+ Valid values are 'coco', 'coco_25', 'wholebody', 'mpii',
67
+ 'ap10k', 'apt36k', 'aic'
68
+ yolo_size (int, optional): Size of the input image for YOLOv8 model. Defaults to 320.
69
+ device (str, optional): Device to use for inference. Defaults to 'cuda' if available, else 'cpu'.
70
+ is_video (bool, optional): Flag indicating if the input is video. Defaults to False.
71
+ single_pose (bool, optional): Flag indicating if the video (on images this flag has no effect)
72
+ will contain a single pose.
73
+ In this case the SORT tracker is not used (increasing performance)
74
+ but people id tracking
75
+ won't be consistent among frames.
76
+ yolo_step (int, optional): The tracker can be used to predict the bboxes instead of yolo for performance,
77
+ this flag specifies how often yolo is applied (e.g. 1 applies yolo every frame).
78
+ This does not have any effect when is_video is False.
79
+ """
80
+
81
+ def __init__(self, model: str,
82
+ yolo: str,
83
+ model_name: Optional[str] = None,
84
+ det_class: Optional[str] = None,
85
+ dataset: Optional[str] = None,
86
+ yolo_size: Optional[int] = 320,
87
+ device: Optional[str] = None,
88
+ is_video: Optional[bool] = False,
89
+ single_pose: Optional[bool] = False,
90
+ yolo_step: Optional[int] = 1):
91
+ assert os.path.isfile(model), f'The model file {model} does not exist'
92
+ assert os.path.isfile(yolo), f'The YOLOv8 model {yolo} does not exist'
93
+
94
+ # Device priority is cuda / mps / cpu
95
+ if device is None:
96
+ if torch.cuda.is_available():
97
+ device = 'cuda'
98
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
99
+ device = 'mps'
100
+ else:
101
+ device = 'cpu'
102
+
103
+ self.device = device
104
+ self.yolo = YOLO(yolo, task='detect')
105
+ self.yolo_size = yolo_size
106
+ self.yolo_step = yolo_step
107
+ self.is_video = is_video
108
+ self.single_pose = single_pose
109
+ self.reset()
110
+
111
+ # State saving during inference
112
+ self.save_state = True # Can be disabled manually
113
+ self._img = None
114
+ self._yolo_res = None
115
+ self._tracker_res = None
116
+ self._keypoints = None
117
+
118
+ # Use extension to decide which kind of model has been loaded
119
+ use_onnx = model.endswith('.onnx')
120
+ use_trt = model.endswith('.engine')
121
+
122
+
123
+ # Extract dataset name
124
+ if dataset is None:
125
+ dataset = infer_dataset_by_path(model)
126
+
127
+ assert dataset in ['mpii', 'coco', 'coco_25', 'wholebody', 'aic', 'ap10k', 'apt36k'], \
128
+ 'The specified dataset is not valid'
129
+
130
+ # Dataset can now be set for visualization
131
+ self.dataset = dataset
132
+
133
+ # if we picked the dataset switch to correct yolo classes if not set
134
+ if det_class is None:
135
+ det_class = 'animals' if dataset in ['ap10k', 'apt36k'] else 'human'
136
+ self.yolo_classes = DETC_TO_YOLO_YOLOC[det_class]
137
+
138
+ assert model_name in [None, 's', 'b', 'l', 'h'], \
139
+ f'The model name {model_name} is not valid'
140
+
141
+ # onnx / trt models do not require model_cfg specification
142
+ if model_name is None:
143
+ assert use_onnx or use_trt, \
144
+ 'Specify the model_name if not using onnx / trt'
145
+ else:
146
+ # Dynamically import the model class
147
+ model_cfg = dyn_model_import(self.dataset, model_name)
148
+
149
+ self.target_size = data_cfg['image_size']
150
+ if use_onnx:
151
+ self._ort_session = onnxruntime.InferenceSession(model,
152
+ providers=['CUDAExecutionProvider',
153
+ 'CPUExecutionProvider'])
154
+ inf_fn = self._inference_onnx
155
+ else:
156
+ self._vit_pose = ViTPose(model_cfg)
157
+ self._vit_pose.eval()
158
+
159
+ if use_trt:
160
+ self._vit_pose = torch.jit.load(model)
161
+ else:
162
+ ckpt = torch.load(model, map_location='cpu')
163
+ if 'state_dict' in ckpt:
164
+ self._vit_pose.load_state_dict(ckpt['state_dict'])
165
+ else:
166
+ self._vit_pose.load_state_dict(ckpt)
167
+ self._vit_pose.to(torch.device(device))
168
+
169
+ inf_fn = self._inference_torch
170
+
171
+ # Override _inference abstract with selected engine
172
+ self._inference = inf_fn # type: ignore
173
+
174
+ def reset(self):
175
+ """
176
+ Reset the inference class to be ready for a new video.
177
+ This will reset the internal counter of frames, on videos
178
+ this is necessary to reset the tracker.
179
+ """
180
+ min_hits = 3 if self.yolo_step == 1 else 1
181
+ use_tracker = self.is_video and not self.single_pose
182
+ self.tracker = Sort(max_age=self.yolo_step,
183
+ min_hits=min_hits,
184
+ iou_threshold=0.3) if use_tracker else None # TODO: Params
185
+ self.frame_counter = 0
186
+
187
+ @classmethod
188
+ def postprocess(cls, heatmaps, org_w, org_h):
189
+ """
190
+ Postprocess the heatmaps to obtain keypoints and their probabilities.
191
+
192
+ Args:
193
+ heatmaps (ndarray): Heatmap predictions from the model.
194
+ org_w (int): Original width of the image.
195
+ org_h (int): Original height of the image.
196
+
197
+ Returns:
198
+ ndarray: Processed keypoints with probabilities.
199
+ """
200
+ points, prob = keypoints_from_heatmaps(heatmaps=heatmaps,
201
+ center=np.array([[org_w // 2,
202
+ org_h // 2]]),
203
+ scale=np.array([[org_w, org_h]]),
204
+ unbiased=True, use_udp=True)
205
+ return np.concatenate([points[:, :, ::-1], prob], axis=2)
206
+
207
+ @abc.abstractmethod
208
+ def _inference(self, img: np.ndarray) -> np.ndarray:
209
+ """
210
+ Abstract method for performing inference on an image.
211
+ It is overloaded by each inference engine.
212
+
213
+ Args:
214
+ img (ndarray): Input image for inference.
215
+
216
+ Returns:
217
+ ndarray: Inference results.
218
+ """
219
+ raise NotImplementedError
220
+
221
+ def inference(self, img: np.ndarray) -> dict[typing.Any, typing.Any]:
222
+ """
223
+ Perform inference on the input image.
224
+
225
+ Args:
226
+ img (ndarray): Input image for inference in RGB format.
227
+
228
+ Returns:
229
+ dict[typing.Any, typing.Any]: Inference results.
230
+ """
231
+
232
+ # First use YOLOv8 for detection
233
+ res_pd = np.empty((0, 5))
234
+ results = None
235
+ if (self.tracker is None or
236
+ (self.frame_counter % self.yolo_step == 0 or self.frame_counter < 3)):
237
+ results = self.yolo(img, verbose=False, imgsz=self.yolo_size,
238
+ device=self.device if self.device != 'cuda' else 0,
239
+ classes=self.yolo_classes)[0]
240
+ res_pd = np.array([r[:5].tolist() for r in # TODO: Confidence threshold
241
+ results.boxes.data.cpu().numpy() if r[4] > 0.35]).reshape((-1, 5))
242
+ self.frame_counter += 1
243
+
244
+ frame_keypoints = {}
245
+ ids = None
246
+ if self.tracker is not None:
247
+ res_pd = self.tracker.update(res_pd)
248
+ ids = res_pd[:, 5].astype(int).tolist()
249
+
250
+ # Prepare boxes for inference
251
+ bboxes = res_pd[:, :4].round().astype(int)
252
+ scores = res_pd[:, 4].tolist()
253
+ pad_bbox = 10
254
+
255
+ if ids is None:
256
+ ids = range(len(bboxes))
257
+
258
+ for bbox, id in zip(bboxes, ids):
259
+ # TODO: Slightly bigger bbox
260
+ bbox[[0, 2]] = np.clip(bbox[[0, 2]] + [-pad_bbox, pad_bbox], 0, img.shape[1])
261
+ bbox[[1, 3]] = np.clip(bbox[[1, 3]] + [-pad_bbox, pad_bbox], 0, img.shape[0])
262
+
263
+ # Crop image and pad to 3/4 aspect ratio
264
+ img_inf = img[bbox[1]:bbox[3], bbox[0]:bbox[2]]
265
+ img_inf, (left_pad, top_pad) = pad_image(img_inf, 3 / 4)
266
+
267
+ keypoints = self._inference(img_inf)[0]
268
+ # Transform keypoints to original image
269
+ keypoints[:, :2] += bbox[:2][::-1] - [top_pad, left_pad]
270
+ frame_keypoints[id] = keypoints
271
+
272
+ if self.save_state:
273
+ self._img = img
274
+ self._yolo_res = results
275
+ self._tracker_res = (bboxes, ids, scores)
276
+ self._keypoints = frame_keypoints
277
+
278
+ return frame_keypoints
279
+
280
+ def draw(self, show_yolo=True, show_raw_yolo=False, confidence_threshold=0.5):
281
+ """
282
+ Draw keypoints and bounding boxes on the image.
283
+
284
+ Args:
285
+ show_yolo (bool, optional): Whether to show YOLOv8 bounding boxes. Default is True.
286
+ show_raw_yolo (bool, optional): Whether to show raw YOLOv8 bounding boxes. Default is False.
+ confidence_threshold (float, optional): Minimum keypoint confidence required to draw a point or limb. Default is 0.5.
287
+
288
+ Returns:
289
+ ndarray: Image with keypoints and bounding boxes drawn.
290
+ """
291
+ img = self._img.copy()
292
+ bboxes, ids, scores = self._tracker_res
293
+
294
+ if self._yolo_res is not None and (show_raw_yolo or (self.tracker is None and show_yolo)):
295
+ img = np.array(self._yolo_res.plot())
296
+
297
+ if show_yolo and self.tracker is not None:
298
+ img = draw_bboxes(img, bboxes, ids, scores)
299
+
300
+ img = np.array(img)[..., ::-1] # RGB to BGR for cv2 modules
301
+ for idx, k in self._keypoints.items():
302
+ img = draw_points_and_skeleton(img.copy(), k,
303
+ joints_dict()[self.dataset]['skeleton'],
304
+ person_index=idx,
305
+ points_color_palette='gist_rainbow',
306
+ skeleton_color_palette='jet',
307
+ points_palette_samples=10,
308
+ confidence_threshold=confidence_threshold)
309
+ return img[..., ::-1] # Return RGB as original
310
+
311
+ def pre_img(self, img):
312
+ org_h, org_w = img.shape[:2]
313
+ img_input = cv2.resize(img, self.target_size, interpolation=cv2.INTER_LINEAR) / 255
314
+ img_input = ((img_input - MEAN) / STD).transpose(2, 0, 1)[None].astype(np.float32)
315
+ return img_input, org_h, org_w
316
+
317
+ @torch.no_grad()
318
+ def _inference_torch(self, img: np.ndarray) -> np.ndarray:
319
+ # Prepare input data
320
+ img_input, org_h, org_w = self.pre_img(img)
321
+ img_input = torch.from_numpy(img_input).to(torch.device(self.device))
322
+
323
+ # Feed to model
324
+ heatmaps = self._vit_pose(img_input).detach().cpu().numpy()
325
+ return self.postprocess(heatmaps, org_w, org_h)
326
+
327
+ def _inference_onnx(self, img: np.ndarray) -> np.ndarray:
328
+ # Prepare input data
329
+ img_input, org_h, org_w = self.pre_img(img)
330
+
331
+ # Feed to model
332
+ ort_inputs = {self._ort_session.get_inputs()[0].name: img_input}
333
+ heatmaps = self._ort_session.run(None, ort_inputs)[0]
334
+ return self.postprocess(heatmaps, org_w, org_h)
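An illustrative usage sketch for the inference wrapper implemented in this file; the class name, import path, checkpoint filenames and image path are assumptions made for the example, not values taken from this diff:

import cv2
from easy_ViTPose.inference import VitInference  # assumed class name and import path

# Placeholder weights: any ViTPose checkpoint plus a YOLOv8 detector file
model = VitInference('vitpose-b-coco.pth', 'yolov8s.pt',
                     model_name='b', dataset='coco', is_video=False)

img = cv2.cvtColor(cv2.imread('person.jpg'), cv2.COLOR_BGR2RGB)  # inference() expects RGB
keypoints = model.inference(img)  # dict: detection id -> (K, 3) array of [y, x, confidence]
canvas = model.draw(show_yolo=True, confidence_threshold=0.5)    # RGB image with skeletons drawn
cv2.imwrite('out.jpg', canvas[..., ::-1])                        # convert back to BGR for imwrite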
easy_ViTPose/sort.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ SORT: A Simple, Online and Realtime Tracker
3
+ Copyright (C) 2016-2020 Alex Bewley alex@bewley.ai
4
+
5
+ This program is free software: you can redistribute it and/or modify
6
+ it under the terms of the GNU General Public License as published by
7
+ the Free Software Foundation, either version 3 of the License, or
8
+ (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+
15
+ You should have received a copy of the GNU General Public License
16
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+ """
18
+ from __future__ import print_function
19
+
20
+ import os
21
+ import numpy as np
22
+ import matplotlib
23
+
24
+ import matplotlib.pyplot as plt
25
+ import matplotlib.patches as patches
26
+ from skimage import io
27
+
28
+ import glob
29
+ import time
30
+ import argparse
31
+ from filterpy.kalman import KalmanFilter
32
+
33
+ np.random.seed(0)
34
+
35
+
36
+ def linear_assignment(cost_matrix):
37
+ try:
38
+ import lap
39
+ _, x, y = lap.lapjv(cost_matrix, extend_cost=True)
40
+ return np.array([[y[i], i] for i in x if i >= 0])
41
+ except ImportError:
42
+ from scipy.optimize import linear_sum_assignment
43
+ x, y = linear_sum_assignment(cost_matrix)
44
+ return np.array(list(zip(x, y)))
45
+
46
+
47
+ def iou_batch(bb_test, bb_gt):
48
+ """
49
+ From SORT: Computes IOU between two bboxes in the form [x1,y1,x2,y2]
50
+ """
51
+ bb_gt = np.expand_dims(bb_gt, 0)
52
+ bb_test = np.expand_dims(bb_test, 1)
53
+
54
+ xx1 = np.maximum(bb_test[..., 0], bb_gt[..., 0])
55
+ yy1 = np.maximum(bb_test[..., 1], bb_gt[..., 1])
56
+ xx2 = np.minimum(bb_test[..., 2], bb_gt[..., 2])
57
+ yy2 = np.minimum(bb_test[..., 3], bb_gt[..., 3])
58
+ w = np.maximum(0., xx2 - xx1)
59
+ h = np.maximum(0., yy2 - yy1)
60
+ wh = w * h
61
+ o = wh / ((bb_test[..., 2] - bb_test[..., 0]) * (bb_test[..., 3] - bb_test[..., 1])
62
+ + (bb_gt[..., 2] - bb_gt[..., 0]) * (bb_gt[..., 3] - bb_gt[..., 1]) - wh)
63
+ return(o)
64
+
65
+
66
+ def convert_bbox_to_z(bbox):
67
+ """
68
+ Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form
69
+ [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is
70
+ the aspect ratio
71
+ """
72
+ w = bbox[2] - bbox[0]
73
+ h = bbox[3] - bbox[1]
74
+ x = bbox[0] + w/2.
75
+ y = bbox[1] + h/2.
76
+ s = w * h # scale is just area
77
+ r = w / float(h)
78
+ return np.array([x, y, s, r]).reshape((4, 1))
79
+
80
+
81
+ def convert_x_to_bbox(x, score=None):
82
+ """
83
+ Takes a bounding box in the centre form [x,y,s,r] and returns it in the form
84
+ [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right
85
+ """
86
+ w = np.sqrt(x[2] * x[3])
87
+ h = x[2] / w
88
+ if score is None:
89
+ return np.array([x[0]-w/2., x[1]-h/2., x[0]+w/2., x[1]+h/2.]).reshape((1, 4))
90
+ else:
91
+ return np.array([x[0]-w/2., x[1]-h/2., x[0]+w/2., x[1]+h/2., score]).reshape((1, 5))
92
+
93
+
94
+ class KalmanBoxTracker(object):
95
+ """
96
+ This class represents the internal state of individual tracked objects observed as bbox.
97
+ """
98
+ count = 0
99
+
100
+ def __init__(self, bbox, score):
101
+ """
102
+ Initialises a tracker using initial bounding box.
103
+ """
104
+ # define constant velocity model
105
+ self.kf = KalmanFilter(dim_x=7, dim_z=4)
106
+ self.kf.F = np.array([[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 1], [
107
+ 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]])
108
+ self.kf.H = np.array([[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]])
109
+
110
+ self.kf.R[2:, 2:] *= 10.
111
+ self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities
112
+ self.kf.P *= 10.
113
+ self.kf.Q[-1, -1] *= 0.01
114
+ self.kf.Q[4:, 4:] *= 0.01
115
+
116
+ self.kf.x[:4] = convert_bbox_to_z(bbox)
117
+ self.time_since_update = 0
118
+ self.id = KalmanBoxTracker.count
119
+ KalmanBoxTracker.count += 1
120
+ self.history = []
121
+ self.hits = 0
122
+ self.hit_streak = 0
123
+ self.age = 0
124
+ self.score = score
125
+
126
+ def update(self, bbox, score):
127
+ """
128
+ Updates the state vector with observed bbox.
129
+ """
130
+ self.time_since_update = 0
131
+ self.history = []
132
+ self.hits += 1
133
+ self.hit_streak += 1
134
+ self.kf.update(convert_bbox_to_z(bbox))
135
+ self.score = score
136
+
137
+ def predict(self):
138
+ """
139
+ Advances the state vector and returns the predicted bounding box estimate.
140
+ """
141
+ if((self.kf.x[6]+self.kf.x[2]) <= 0):
142
+ self.kf.x[6] *= 0.0
143
+ self.kf.predict()
144
+ self.age += 1
145
+ if(self.time_since_update > 0):
146
+ self.hit_streak = 0
147
+ self.time_since_update += 1
148
+ self.history.append(convert_x_to_bbox(self.kf.x))
149
+ return self.history[-1]
150
+
151
+ def get_state(self):
152
+ """
153
+ Returns the current bounding box estimate.
154
+ """
155
+ return convert_x_to_bbox(self.kf.x)
156
+
157
+
158
+ def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3):
159
+ """
160
+ Assigns detections to tracked object (both represented as bounding boxes)
161
+
162
+ Returns 3 lists of matches, unmatched_detections and unmatched_trackers
163
+ """
164
+ if(len(trackers) == 0):
165
+ return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int)
166
+
167
+ iou_matrix = iou_batch(detections, trackers)
168
+
169
+ if min(iou_matrix.shape) > 0:
170
+ a = (iou_matrix > iou_threshold).astype(np.int32)
171
+ if a.sum(1).max() == 1 and a.sum(0).max() == 1:
172
+ matched_indices = np.stack(np.where(a), axis=1)
173
+ else:
174
+ matched_indices = linear_assignment(-iou_matrix)
175
+ else:
176
+ matched_indices = np.empty(shape=(0, 2))
177
+
178
+ unmatched_detections = []
179
+ for d, det in enumerate(detections):
180
+ if(d not in matched_indices[:, 0]):
181
+ unmatched_detections.append(d)
182
+ unmatched_trackers = []
183
+ for t, trk in enumerate(trackers):
184
+ if(t not in matched_indices[:, 1]):
185
+ unmatched_trackers.append(t)
186
+
187
+ # filter out matched with low IOU
188
+ matches = []
189
+ for m in matched_indices:
190
+ if(iou_matrix[m[0], m[1]] < iou_threshold):
191
+ unmatched_detections.append(m[0])
192
+ unmatched_trackers.append(m[1])
193
+ else:
194
+ matches.append(m.reshape(1, 2))
195
+ if(len(matches) == 0):
196
+ matches = np.empty((0, 2), dtype=int)
197
+ else:
198
+ matches = np.concatenate(matches, axis=0)
199
+
200
+ return matches, np.array(unmatched_detections), np.array(unmatched_trackers)
201
+
202
+
203
+ class Sort(object):
204
+ def __init__(self, max_age=1, min_hits=3, iou_threshold=0.3):
205
+ """
206
+ Sets key parameters for SORT
207
+ """
208
+ self.max_age = max_age
209
+ self.min_hits = min_hits
210
+ self.iou_threshold = iou_threshold
211
+ self.trackers = []
212
+ self.frame_count = 0
213
+
214
+ def update(self, dets=np.empty((0, 5))):
215
+ """
216
+ Params:
217
+ dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]
218
+ Requires: this method must be called once for each frame even with empty detections (use np.empty((0, 5)) for frames without detections).
219
+ Returns the a similar array, where the last column is the object ID.
220
+
221
+ NOTE: The number of objects returned may differ from the number of detections provided.
222
+ """
223
+ self.frame_count += 1
224
+ empty_dets = dets.shape[0] == 0
225
+
226
+ # get predicted locations from existing trackers.
227
+ trks = np.zeros((len(self.trackers), 5))
228
+ to_del = []
229
+ ret = []
230
+ for t, trk in enumerate(trks):
231
+ pos = self.trackers[t].predict()[0]
232
+ trk[:] = [pos[0], pos[1], pos[2], pos[3], 0]
233
+ if np.any(np.isnan(pos)):
234
+ to_del.append(t)
235
+ trks = np.ma.compress_rows(np.ma.masked_invalid(trks))
236
+ for t in reversed(to_del):
237
+ self.trackers.pop(t)
238
+ matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers(dets, trks, self.iou_threshold)
239
+
240
+ # update matched trackers with assigned detections
241
+ for m in matched:
242
+ self.trackers[m[1]].update(dets[m[0], :], dets[m[0], -1])
243
+
244
+ # create and initialise new trackers for unmatched detections
245
+ for i in unmatched_dets:
246
+ trk = KalmanBoxTracker(dets[i, :], dets[i, -1])
247
+ self.trackers.append(trk)
248
+
249
+ i = len(self.trackers)
250
+ unmatched = []
251
+ for trk in reversed(self.trackers):
252
+ d = trk.get_state()[0]
253
+ if (trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits):
254
+ ret.append(np.concatenate((d, [trk.score, trk.id+1])).reshape(1, -1)) # +1 as MOT benchmark requires positive
255
+ i -= 1
256
+ # remove dead tracklet
257
+ if(trk.time_since_update > self.max_age):
258
+ self.trackers.pop(i)
259
+ if empty_dets:
260
+ unmatched.append(np.concatenate((d, [trk.score, trk.id + 1])).reshape(1, -1))
261
+
262
+ if len(ret):
263
+ return np.concatenate(ret)
264
+ elif empty_dets:
265
+ return np.concatenate(unmatched) if len(unmatched) else np.empty((0, 6))
266
+ return np.empty((0, 6))
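An illustrative sketch of how the tracker above is driven, one update() call per frame; the detection boxes are made-up numbers:

import numpy as np
from easy_ViTPose.sort import Sort  # assumed import path

tracker = Sort(max_age=1, min_hits=3, iou_threshold=0.3)

# Each row is [x1, y1, x2, y2, score]; pass np.empty((0, 5)) on frames without detections
frames = [
    np.array([[100., 100., 200., 300., 0.9]]),
    np.array([[105., 102., 205., 303., 0.8]]),
    np.empty((0, 5)),
]
for dets in frames:
    tracks = tracker.update(dets)  # rows are [x1, y1, x2, y2, score, track_id]
    print(tracks)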
easy_ViTPose/to_onnx.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
easy_ViTPose/to_trt.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
easy_ViTPose/train.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import argparse
3
+ import copy
4
+ import os
5
+ import os.path as osp
6
+ import time
7
+ import warnings
8
+ import click
9
+ import yaml
10
+
11
+ from glob import glob
12
+
13
+ import torch
14
+ import torch.distributed as dist
15
+
16
+ from vit_utils.util import init_random_seed, set_random_seed
17
+ from vit_utils.dist_util import get_dist_info, init_dist
18
+ from vit_utils.logging import get_root_logger
19
+
20
+ import configs.ViTPose_small_coco_256x192 as s_cfg
21
+ import configs.ViTPose_base_coco_256x192 as b_cfg
22
+ import configs.ViTPose_large_coco_256x192 as l_cfg
23
+ import configs.ViTPose_huge_coco_256x192 as h_cfg
24
+
25
+ from vit_models.model import ViTPose
26
+ from datasets.COCO import COCODataset
27
+ from vit_utils.train_valid_fn import train_model
28
+
29
+ CUR_PATH = osp.dirname(__file__)
30
+
31
+ @click.command()
32
+ @click.option('--config-path', type=click.Path(exists=True), default='config.yaml', required=True, help='train config file path')
33
+ @click.option('--model-name', type=str, default='b', required=True, help='[s: ViT-S, b: ViT-B, l: ViT-L, h: ViT-H]')
34
+ def main(config_path, model_name):
35
+
36
+ cfg = {'b':b_cfg,
37
+ 's':s_cfg,
38
+ 'l':l_cfg,
39
+ 'h':h_cfg}.get(model_name.lower())
40
+ # Load config.yaml
41
+ with open(config_path, 'r') as f:
42
+ cfg_yaml = yaml.load(f, Loader=yaml.SafeLoader)
43
+
44
+ for k, v in cfg_yaml.items():
45
+ if hasattr(cfg, k):
46
+ raise ValueError(f"Key {k} already exists in config")
47
+ else:
48
+ cfg.__setattr__(k, v)
49
+
50
+ # set cudnn_benchmark
51
+ if cfg.cudnn_benchmark:
52
+ torch.backends.cudnn.benchmark = True
53
+
54
+ # Set work directory (session-level)
55
+ if not hasattr(cfg, 'work_dir'):
56
+ cfg.__setattr__('work_dir', f"{CUR_PATH}/runs/train")
57
+
58
+ if not osp.exists(cfg.work_dir):
59
+ os.makedirs(cfg.work_dir)
60
+ session_list = sorted(glob(f"{cfg.work_dir}/*"))
61
+ if len(session_list) == 0:
62
+ session = 1
63
+ else:
64
+ session = int(os.path.basename(session_list[-1])) + 1
65
+ session_dir = osp.join(cfg.work_dir, str(session).zfill(3))
66
+ os.makedirs(session_dir)
67
+ cfg.__setattr__('work_dir', session_dir)
68
+
69
+
70
+ if cfg.autoscale_lr:
71
+ # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
72
+ cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
73
+
74
+ # init distributed env first, since logger depends on the dist info.
75
+ if cfg.launcher == 'none':
76
+ distributed = False
77
+ if len(cfg.gpu_ids) > 1:
78
+ warnings.warn(
79
+ f"We treat {cfg.gpu_ids} as gpu-ids, and reset to "
80
+ f"{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in "
81
+ "non-distribute training time.")
82
+ cfg.gpu_ids = cfg.gpu_ids[0:1]
83
+ else:
84
+ distributed = True
85
+ init_dist(cfg.launcher, **cfg.dist_params)
86
+ # re-set gpu_ids with distributed training mode
87
+ _, world_size = get_dist_info()
88
+ cfg.gpu_ids = range(world_size)
89
+
90
+ # init the logger before other steps
91
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
92
+ log_file = osp.join(session_dir, f'{timestamp}.log')
93
+ logger = get_root_logger(log_file=log_file)
94
+
95
+ # init the meta dict to record some important information such as
96
+ # environment info and seed, which will be logged
97
+ meta = dict()
98
+
99
+ # log some basic info
100
+ logger.info(f'Distributed training: {distributed}')
101
+
102
+ # set random seeds
103
+ seed = init_random_seed(cfg.seed)
104
+ logger.info(f"Set random seed to {seed}, "
105
+ f"deterministic: {cfg.deterministic}")
106
+ set_random_seed(seed, deterministic=cfg.deterministic)
107
+ meta['seed'] = seed
108
+
109
+ # Set model
110
+ model = ViTPose(cfg.model)
111
+ if cfg.resume_from:
112
+ # Load ckpt partially
113
+ ckpt_state = torch.load(cfg.resume_from)['state_dict']
114
+ ckpt_state.pop('keypoint_head.final_layer.bias')
115
+ ckpt_state.pop('keypoint_head.final_layer.weight')
116
+ model.load_state_dict(ckpt_state, strict=False)
117
+
118
+ # freeze the backbone, leave the head to be finetuned
119
+ model.backbone.frozen_stages = model.backbone.depth - 1
120
+ model.backbone.freeze_ffn = True
121
+ model.backbone.freeze_attn = True
122
+ model.backbone._freeze_stages()
123
+
124
+ # Set dataset
125
+ datasets_train = COCODataset(
126
+ root_path=cfg.data_root,
127
+ data_version="feet_train",
128
+ is_train=True,
129
+ use_gt_bboxes=True,
130
+ image_width=192,
131
+ image_height=256,
132
+ scale=True,
133
+ scale_factor=0.35,
134
+ flip_prob=0.5,
135
+ rotate_prob=0.5,
136
+ rotation_factor=45.,
137
+ half_body_prob=0.3,
138
+ use_different_joints_weight=True,
139
+ heatmap_sigma=3,
140
+ soft_nms=False
141
+ )
142
+
143
+ datasets_valid = COCODataset(
144
+ root_path=cfg.data_root,
145
+ data_version="feet_val",
146
+ is_train=False,
147
+ use_gt_bboxes=True,
148
+ image_width=192,
149
+ image_height=256,
150
+ scale=False,
151
+ scale_factor=0.35,
152
+ flip_prob=0.5,
153
+ rotate_prob=0.5,
154
+ rotation_factor=45.,
155
+ half_body_prob=0.3,
156
+ use_different_joints_weight=True,
157
+ heatmap_sigma=3,
158
+ soft_nms=False
159
+ )
160
+
161
+ train_model(
162
+ model=model,
163
+ datasets_train=datasets_train,
164
+ datasets_valid=datasets_valid,
165
+ cfg=cfg,
166
+ distributed=distributed,
167
+ validate=cfg.validate,
168
+ timestamp=timestamp,
169
+ meta=meta
170
+ )
171
+
172
+
173
+ if __name__ == '__main__':
174
+ main()
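For reference, the keys main() reads from config.yaml, written as the dict yaml.load would return; every value below is an illustrative assumption, and train_model / COCODataset may require further keys not shown here:

cfg_yaml = {
    'cudnn_benchmark': True,
    'autoscale_lr': False,       # if True, optimizer['lr'] is rescaled by len(gpu_ids) / 8
    'launcher': 'none',          # 'none' for single-machine runs, otherwise a distributed launcher
    'dist_params': {'backend': 'nccl'},
    'gpu_ids': [0],
    'seed': 0,
    'deterministic': True,
    'resume_from': '',           # checkpoint path to partially load, or empty to train from scratch
    'data_root': './data/coco',  # root folder passed to COCODataset
    'validate': True,
}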
easy_ViTPose/vit_models/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ import sys
2
+ import os.path as osp
3
+
4
+ sys.path.append(osp.dirname(osp.dirname(__file__)))
5
+
6
+ from vit_utils.util import load_checkpoint, resize, constant_init, normal_init
7
+ from vit_utils.top_down_eval import keypoints_from_heatmaps, pose_pck_accuracy
8
+ from vit_utils.post_processing import *
easy_ViTPose/vit_models/backbone/__init__.py ADDED
File without changes
easy_ViTPose/vit_models/backbone/vit.py ADDED
@@ -0,0 +1,394 @@
1
+
2
+ import math
3
+ import warnings
4
+
5
+ from itertools import repeat
6
+ import collections.abc
7
+
8
+ import torch
9
+ from functools import partial
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint as checkpoint
13
+ from torch import Tensor
14
+
15
+ # from timm.models.layers import drop_path, to_2tuple, trunc_normal_
16
+
17
+ # from .base_backbone import BaseBackbone
18
+
19
+ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
20
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
21
+
22
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
23
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
24
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
25
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
26
+ 'survival rate' as the argument.
27
+
28
+ """
29
+ if drop_prob == 0. or not training:
30
+ return x
31
+ keep_prob = 1 - drop_prob
32
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
33
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
34
+ if keep_prob > 0.0 and scale_by_keep:
35
+ random_tensor.div_(keep_prob)
36
+ return x * random_tensor
37
+
38
+ def _ntuple(n):
39
+ def parse(x):
40
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
41
+ return x
42
+ return tuple(repeat(x, n))
43
+ return parse
44
+
45
+
46
+ to_1tuple = _ntuple(1)
47
+ to_2tuple = _ntuple(2)
48
+ to_3tuple = _ntuple(3)
49
+ to_4tuple = _ntuple(4)
50
+ to_ntuple = _ntuple
51
+
52
+ def _trunc_normal_(tensor, mean, std, a, b):
53
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
54
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
55
+ def norm_cdf(x):
56
+ # Computes standard normal cumulative distribution function
57
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
58
+
59
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
60
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
61
+ "The distribution of values may be incorrect.",
62
+ stacklevel=2)
63
+
64
+ # Values are generated by using a truncated uniform distribution and
65
+ # then using the inverse CDF for the normal distribution.
66
+ # Get upper and lower cdf values
67
+ l = norm_cdf((a - mean) / std)
68
+ u = norm_cdf((b - mean) / std)
69
+
70
+ # Uniformly fill tensor with values from [l, u], then translate to
71
+ # [2l-1, 2u-1].
72
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
73
+
74
+ # Use inverse cdf transform for normal distribution to get truncated
75
+ # standard normal
76
+ tensor.erfinv_()
77
+
78
+ # Transform to proper mean, std
79
+ tensor.mul_(std * math.sqrt(2.))
80
+ tensor.add_(mean)
81
+
82
+ # Clamp to ensure it's in the proper range
83
+ tensor.clamp_(min=a, max=b)
84
+ return tensor
85
+
86
+
87
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
88
+ # type: (Tensor, float, float, float, float) -> Tensor
89
+ r"""Fills the input Tensor with values drawn from a truncated
90
+ normal distribution. The values are effectively drawn from the
91
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
92
+ with values outside :math:`[a, b]` redrawn until they are within
93
+ the bounds. The method used for generating the random values works
94
+ best when :math:`a \leq \text{mean} \leq b`.
95
+
96
+ NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
97
+ applied while sampling the normal with mean/std applied, therefore a, b args
98
+ should be adjusted to match the range of mean, std args.
99
+
100
+ Args:
101
+ tensor: an n-dimensional `torch.Tensor`
102
+ mean: the mean of the normal distribution
103
+ std: the standard deviation of the normal distribution
104
+ a: the minimum cutoff value
105
+ b: the maximum cutoff value
106
+ Examples:
107
+ >>> w = torch.empty(3, 5)
108
+ >>> nn.init.trunc_normal_(w)
109
+ """
110
+ with torch.no_grad():
111
+ return _trunc_normal_(tensor, mean, std, a, b)
112
+
113
+ class DropPath(nn.Module):
114
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
115
+ """
116
+ def __init__(self, drop_prob=None):
117
+ super(DropPath, self).__init__()
118
+ self.drop_prob = drop_prob
119
+
120
+ def forward(self, x):
121
+ return drop_path(x, self.drop_prob, self.training)
122
+
123
+ def extra_repr(self):
124
+ return 'p={}'.format(self.drop_prob)
125
+
126
+ class Mlp(nn.Module):
127
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
128
+ super().__init__()
129
+ out_features = out_features or in_features
130
+ hidden_features = hidden_features or in_features
131
+ self.fc1 = nn.Linear(in_features, hidden_features)
132
+ self.act = act_layer()
133
+ self.fc2 = nn.Linear(hidden_features, out_features)
134
+ self.drop = nn.Dropout(drop)
135
+
136
+ def forward(self, x):
137
+ x = self.fc1(x)
138
+ x = self.act(x)
139
+ x = self.fc2(x)
140
+ x = self.drop(x)
141
+ return x
142
+
143
+ class Attention(nn.Module):
144
+ def __init__(
145
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
146
+ proj_drop=0., attn_head_dim=None,):
147
+ super().__init__()
148
+ self.num_heads = num_heads
149
+ head_dim = dim // num_heads
150
+ self.dim = dim
151
+
152
+ if attn_head_dim is not None:
153
+ head_dim = attn_head_dim
154
+ all_head_dim = head_dim * self.num_heads
155
+
156
+ self.scale = qk_scale or head_dim ** -0.5
157
+
158
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
159
+
160
+ self.attn_drop = nn.Dropout(attn_drop)
161
+ self.proj = nn.Linear(all_head_dim, dim)
162
+ self.proj_drop = nn.Dropout(proj_drop)
163
+
164
+ def forward(self, x):
165
+ B, N, C = x.shape
166
+ qkv = self.qkv(x)
167
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
168
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
169
+
170
+ q = q * self.scale
171
+ attn = (q @ k.transpose(-2, -1))
172
+
173
+ attn = attn.softmax(dim=-1)
174
+ attn = self.attn_drop(attn)
175
+
176
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
177
+ x = self.proj(x)
178
+ x = self.proj_drop(x)
179
+
180
+ return x
181
+
182
+ class Block(nn.Module):
183
+
184
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
185
+ drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
186
+ norm_layer=nn.LayerNorm, attn_head_dim=None
187
+ ):
188
+ super().__init__()
189
+
190
+ self.norm1 = norm_layer(dim)
191
+ self.attn = Attention(
192
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
193
+ attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
194
+ )
195
+
196
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
197
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
198
+ self.norm2 = norm_layer(dim)
199
+ mlp_hidden_dim = int(dim * mlp_ratio)
200
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
201
+
202
+ def forward(self, x):
203
+ x = x + self.drop_path(self.attn(self.norm1(x)))
204
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
205
+ return x
206
+
207
+
208
+ class PatchEmbed(nn.Module):
209
+ """ Image to Patch Embedding
210
+ """
211
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
212
+ super().__init__()
213
+ img_size = to_2tuple(img_size)
214
+ patch_size = to_2tuple(patch_size)
215
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
216
+ self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
217
+ self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
218
+ self.img_size = img_size
219
+ self.patch_size = patch_size
220
+ self.num_patches = num_patches
221
+
222
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1))
223
+
224
+ def forward(self, x):
225
+ x = self.proj(x)
226
+ B, C, Hp, Wp = x.shape
227
+ x = x.view(B, C, Hp * Wp).transpose(1, 2)
228
+ return x, (Hp, Wp)
229
+
230
+
231
+ class HybridEmbed(nn.Module):
232
+ """ CNN Feature Map Embedding
233
+ Extract feature map from CNN, flatten, project to embedding dim.
234
+ """
235
+ def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
236
+ super().__init__()
237
+ assert isinstance(backbone, nn.Module)
238
+ img_size = to_2tuple(img_size)
239
+ self.img_size = img_size
240
+ self.backbone = backbone
241
+ if feature_size is None:
242
+ with torch.no_grad():
243
+ training = backbone.training
244
+ if training:
245
+ backbone.eval()
246
+ o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
247
+ feature_size = o.shape[-2:]
248
+ feature_dim = o.shape[1]
249
+ backbone.train(training)
250
+ else:
251
+ feature_size = to_2tuple(feature_size)
252
+ feature_dim = self.backbone.feature_info.channels()[-1]
253
+ self.num_patches = feature_size[0] * feature_size[1]
254
+ self.proj = nn.Linear(feature_dim, embed_dim)
255
+
256
+ def forward(self, x):
257
+ x = self.backbone(x)[-1]
258
+ x = x.flatten(2).transpose(1, 2)
259
+ x = self.proj(x)
260
+ return x
261
+
262
+
263
+ class ViT(nn.Module):
264
+ def __init__(self,
265
+ img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
266
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
267
+ drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
268
+ frozen_stages=-1, ratio=1, last_norm=True,
269
+ patch_padding='pad', freeze_attn=False, freeze_ffn=False,
270
+ ):
271
+ super(ViT, self).__init__()
272
+ # Protect mutable default arguments
273
+
274
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
275
+ self.num_classes = num_classes
276
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
277
+ self.frozen_stages = frozen_stages
278
+ self.use_checkpoint = use_checkpoint
279
+ self.patch_padding = patch_padding
280
+ self.freeze_attn = freeze_attn
281
+ self.freeze_ffn = freeze_ffn
282
+ self.depth = depth
283
+
284
+ if hybrid_backbone is not None:
285
+ self.patch_embed = HybridEmbed(
286
+ hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
287
+ else:
288
+ self.patch_embed = PatchEmbed(
289
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
290
+ num_patches = self.patch_embed.num_patches
291
+
292
+ # since the pretraining model has class token
293
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
294
+
295
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
296
+
297
+ self.blocks = nn.ModuleList([
298
+ Block(
299
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
300
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
301
+ )
302
+ for i in range(depth)])
303
+
304
+ self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
305
+
306
+ if self.pos_embed is not None:
307
+ trunc_normal_(self.pos_embed, std=.02)
308
+
309
+ self._freeze_stages()
310
+
311
+ def _freeze_stages(self):
312
+ """Freeze parameters."""
313
+ if self.frozen_stages >= 0:
314
+ self.patch_embed.eval()
315
+ for param in self.patch_embed.parameters():
316
+ param.requires_grad = False
317
+
318
+ for i in range(1, self.frozen_stages + 1):
319
+ m = self.blocks[i]
320
+ m.eval()
321
+ for param in m.parameters():
322
+ param.requires_grad = False
323
+
324
+ if self.freeze_attn:
325
+ for i in range(0, self.depth):
326
+ m = self.blocks[i]
327
+ m.attn.eval()
328
+ m.norm1.eval()
329
+ for param in m.attn.parameters():
330
+ param.requires_grad = False
331
+ for param in m.norm1.parameters():
332
+ param.requires_grad = False
333
+
334
+ if self.freeze_ffn:
335
+ self.pos_embed.requires_grad = False
336
+ self.patch_embed.eval()
337
+ for param in self.patch_embed.parameters():
338
+ param.requires_grad = False
339
+ for i in range(0, self.depth):
340
+ m = self.blocks[i]
341
+ m.mlp.eval()
342
+ m.norm2.eval()
343
+ for param in m.mlp.parameters():
344
+ param.requires_grad = False
345
+ for param in m.norm2.parameters():
346
+ param.requires_grad = False
347
+
348
+ def init_weights(self, pretrained=None):
349
+ """Initialize the weights in backbone.
350
+ Args:
351
+ pretrained (str, optional): Path to pre-trained weights.
352
+ Defaults to None.
353
+ """
354
+ # NOTE: BaseBackbone is not imported here (see the commented import above), so there is no parent init_weights to call; pretrained checkpoints are loaded via load_state_dict elsewhere.
355
+
356
+ if pretrained is None:
357
+ def _init_weights(m):
358
+ if isinstance(m, nn.Linear):
359
+ trunc_normal_(m.weight, std=.02)
360
+ if isinstance(m, nn.Linear) and m.bias is not None:
361
+ nn.init.constant_(m.bias, 0)
362
+ elif isinstance(m, nn.LayerNorm):
363
+ nn.init.constant_(m.bias, 0)
364
+ nn.init.constant_(m.weight, 1.0)
365
+
366
+ self.apply(_init_weights)
367
+
368
+ def get_num_layers(self):
369
+ return len(self.blocks)
370
+
371
+ @torch.jit.ignore
372
+ def no_weight_decay(self):
373
+ return {'pos_embed', 'cls_token'}
374
+
375
+ def forward(self, x):
376
+ B, C, H, W = x.shape
377
+ x, (Hp, Wp) = self.patch_embed(x)
378
+
379
+ if self.pos_embed is not None:
380
+ # fit for multiple GPU training
381
+ # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
382
+ x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
383
+
384
+ for blk in self.blocks:
385
+ x = blk(x)
386
+
387
+ x = self.last_norm(x)
388
+ x = x.permute(0, 2, 1).view(B, -1, Hp, Wp).contiguous()
389
+ return x
390
+
391
+ def train(self, mode=True):
392
+ """Convert the model into training mode."""
393
+ super().train(mode)
394
+ self._freeze_stages()
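A quick shape check for the ViT backbone defined above (an illustrative sketch; the hyper-parameters mirror a ViT-B style configuration and the import path assumes this repo's package layout):

import torch
from easy_ViTPose.vit_models.backbone.vit import ViT

backbone = ViT(img_size=(256, 192), patch_size=16, embed_dim=768,
               depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True)
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 256, 192))
print(feats.shape)  # torch.Size([1, 768, 16, 12]): a 1/16-resolution feature map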
easy_ViTPose/vit_models/head/__init__.py ADDED
File without changes
easy_ViTPose/vit_models/head/topdown_heatmap_base_head.py ADDED
@@ -0,0 +1,120 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from abc import ABCMeta, abstractmethod
3
+
4
+ import numpy as np
5
+ import torch.nn as nn
6
+
7
+ from .. import keypoints_from_heatmaps
8
+
9
+
10
+ class TopdownHeatmapBaseHead(nn.Module):
11
+ """Base class for top-down heatmap heads.
12
+
13
+ All top-down heatmap heads should subclass it.
14
+ All subclass should overwrite:
15
+
16
+ Methods:`get_loss`, supporting to calculate loss.
17
+ Methods:`get_accuracy`, supporting to calculate accuracy.
18
+ Methods:`forward`, supporting to forward model.
19
+ Methods:`inference_model`, supporting to inference model.
20
+ """
21
+
22
+ __metaclass__ = ABCMeta
23
+
24
+ @abstractmethod
25
+ def get_loss(self, **kwargs):
26
+ """Gets the loss."""
27
+
28
+ @abstractmethod
29
+ def get_accuracy(self, **kwargs):
30
+ """Gets the accuracy."""
31
+
32
+ @abstractmethod
33
+ def forward(self, **kwargs):
34
+ """Forward function."""
35
+
36
+ @abstractmethod
37
+ def inference_model(self, **kwargs):
38
+ """Inference function."""
39
+
40
+ def decode(self, img_metas, output, **kwargs):
41
+ """Decode keypoints from heatmaps.
42
+
43
+ Args:
44
+ img_metas (list(dict)): Information about data augmentation
45
+ By default this includes:
46
+
47
+ - "image_file: path to the image file
48
+ - "center": center of the bbox
49
+ - "scale": scale of the bbox
50
+ - "rotation": rotation of the bbox
51
+ - "bbox_score": score of bbox
52
+ output (np.ndarray[N, K, H, W]): model predicted heatmaps.
53
+ """
54
+ batch_size = len(img_metas)
55
+
56
+ if 'bbox_id' in img_metas[0]:
57
+ bbox_ids = []
58
+ else:
59
+ bbox_ids = None
60
+
61
+ c = np.zeros((batch_size, 2), dtype=np.float32)
62
+ s = np.zeros((batch_size, 2), dtype=np.float32)
63
+ image_paths = []
64
+ score = np.ones(batch_size)
65
+ for i in range(batch_size):
66
+ c[i, :] = img_metas[i]['center']
67
+ s[i, :] = img_metas[i]['scale']
68
+ image_paths.append(img_metas[i]['image_file'])
69
+
70
+ if 'bbox_score' in img_metas[i]:
71
+ score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1)
72
+ if bbox_ids is not None:
73
+ bbox_ids.append(img_metas[i]['bbox_id'])
74
+
75
+ preds, maxvals = keypoints_from_heatmaps(
76
+ output,
77
+ c,
78
+ s,
79
+ unbiased=self.test_cfg.get('unbiased_decoding', False),
80
+ post_process=self.test_cfg.get('post_process', 'default'),
81
+ kernel=self.test_cfg.get('modulate_kernel', 11),
82
+ valid_radius_factor=self.test_cfg.get('valid_radius_factor',
83
+ 0.0546875),
84
+ use_udp=self.test_cfg.get('use_udp', False),
85
+ target_type=self.test_cfg.get('target_type', 'GaussianHeatmap'))
86
+
87
+ all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32)
88
+ all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
89
+ all_preds[:, :, 0:2] = preds[:, :, 0:2]
90
+ all_preds[:, :, 2:3] = maxvals
91
+ all_boxes[:, 0:2] = c[:, 0:2]
92
+ all_boxes[:, 2:4] = s[:, 0:2]
93
+ all_boxes[:, 4] = np.prod(s * 200.0, axis=1)
94
+ all_boxes[:, 5] = score
95
+
96
+ result = {}
97
+
98
+ result['preds'] = all_preds
99
+ result['boxes'] = all_boxes
100
+ result['image_paths'] = image_paths
101
+ result['bbox_ids'] = bbox_ids
102
+
103
+ return result
104
+
105
+ @staticmethod
106
+ def _get_deconv_cfg(deconv_kernel):
107
+ """Get configurations for deconv layers."""
108
+ if deconv_kernel == 4:
109
+ padding = 1
110
+ output_padding = 0
111
+ elif deconv_kernel == 3:
112
+ padding = 1
113
+ output_padding = 1
114
+ elif deconv_kernel == 2:
115
+ padding = 0
116
+ output_padding = 0
117
+ else:
118
+ raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')
119
+
120
+ return deconv_kernel, padding, output_padding
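A sketch of how decode() above is used at test time, via the concrete subclass added in the next file; the center/scale values and image path are invented, scale is in the 200-pixel units implied by the `s * 200.0` line above, and the head configuration is only an example:

import numpy as np
from easy_ViTPose.vit_models.head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead

head = TopdownHeatmapSimpleHead(in_channels=768, out_channels=17)

heatmaps = np.random.rand(1, 17, 64, 48).astype(np.float32)  # stand-in for model output
img_metas = [dict(image_file='person.jpg',
                  center=np.array([128., 96.], dtype=np.float32),
                  scale=np.array([1.28, 0.96], dtype=np.float32),
                  rotation=0,
                  bbox_score=1.0)]
result = head.decode(img_metas, heatmaps)
print(result['preds'].shape)  # (1, 17, 3): x, y and confidence for every keypoint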
easy_ViTPose/vit_models/head/topdown_heatmap_simple_head.py ADDED
@@ -0,0 +1,334 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+ from .. import constant_init, normal_init
5
+
6
+ from .. import pose_pck_accuracy, flip_back, resize
7
+ import torch.nn.functional as F
8
+ from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
9
+
10
+
11
+ class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead):
12
+ """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple
13
+ Baselines for Human Pose Estimation and Tracking``.
14
+
15
+ TopdownHeatmapSimpleHead consists of (>= 0) deconv layers
16
+ and a simple conv2d layer.
17
+
18
+ Args:
19
+ in_channels (int): Number of input channels
20
+ out_channels (int): Number of output channels
21
+ num_deconv_layers (int): Number of deconv layers.
22
+ num_deconv_layers should >= 0. Note that 0 means
23
+ no deconv layers.
24
+ num_deconv_filters (list|tuple): Number of filters.
25
+ If num_deconv_layers > 0, its length should equal num_deconv_layers.
26
+ num_deconv_kernels (list|tuple): Kernel sizes.
27
+ in_index (int|Sequence[int]): Input feature index. Default: 0
28
+ input_transform (str|None): Transformation type of input features.
29
+ Options: 'resize_concat', 'multiple_select', None.
30
+ Default: None.
31
+
32
+ - 'resize_concat': Multiple feature maps will be resized to the
33
+ same size as the first one and then concatenated together.
34
+ Usually used in FCN head of HRNet.
35
+ - 'multiple_select': Multiple feature maps will be bundled into
36
+ a list and passed into decode head.
37
+ - None: Only one select feature map is allowed.
38
+ align_corners (bool): align_corners argument of F.interpolate.
39
+ Default: False.
40
+ loss_keypoint (dict): Config for keypoint loss. Default: None.
41
+ """
42
+
43
+ def __init__(self,
44
+ in_channels,
45
+ out_channels,
46
+ num_deconv_layers=3,
47
+ num_deconv_filters=(256, 256, 256),
48
+ num_deconv_kernels=(4, 4, 4),
49
+ extra=None,
50
+ in_index=0,
51
+ input_transform=None,
52
+ align_corners=False,
53
+ loss_keypoint=None,
54
+ train_cfg=None,
55
+ test_cfg=None,
56
+ upsample=0,):
57
+ super().__init__()
58
+
59
+ self.in_channels = in_channels
60
+ self.loss = loss_keypoint
61
+ self.upsample = upsample
62
+
63
+ self.train_cfg = {} if train_cfg is None else train_cfg
64
+ self.test_cfg = {} if test_cfg is None else test_cfg
65
+ self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap')
66
+
67
+ self._init_inputs(in_channels, in_index, input_transform)
68
+ self.in_index = in_index
69
+ self.align_corners = align_corners
70
+
71
+ if extra is not None and not isinstance(extra, dict):
72
+ raise TypeError('extra should be dict or None.')
73
+
74
+ if num_deconv_layers > 0:
75
+ self.deconv_layers = self._make_deconv_layer(
76
+ num_deconv_layers,
77
+ num_deconv_filters,
78
+ num_deconv_kernels,
79
+ )
80
+ elif num_deconv_layers == 0:
81
+ self.deconv_layers = nn.Identity()
82
+ else:
83
+ raise ValueError(
84
+ f'num_deconv_layers ({num_deconv_layers}) should >= 0.')
85
+
86
+ identity_final_layer = False
87
+ if extra is not None and 'final_conv_kernel' in extra:
88
+ assert extra['final_conv_kernel'] in [0, 1, 3]
89
+ if extra['final_conv_kernel'] == 3:
90
+ padding = 1
91
+ elif extra['final_conv_kernel'] == 1:
92
+ padding = 0
93
+ else:
94
+ # 0 for Identity mapping.
95
+ identity_final_layer = True
96
+ kernel_size = extra['final_conv_kernel']
97
+ else:
98
+ kernel_size = 1
99
+ padding = 0
100
+
101
+ if identity_final_layer:
102
+ self.final_layer = nn.Identity()
103
+ else:
104
+ conv_channels = num_deconv_filters[
105
+ -1] if num_deconv_layers > 0 else self.in_channels
106
+
107
+ layers = []
108
+ if extra is not None:
109
+ num_conv_layers = extra.get('num_conv_layers', 0)
110
+ num_conv_kernels = extra.get('num_conv_kernels',
111
+ [1] * num_conv_layers)
112
+
113
+ for i in range(num_conv_layers):
114
+ layers.append(
115
+ nn.Conv2d(in_channels=conv_channels,
116
+ out_channels=conv_channels,
117
+ kernel_size=num_conv_kernels[i],
118
+ stride=1,
119
+ padding=(num_conv_kernels[i] - 1) // 2)
120
+ )
121
+ layers.append(nn.BatchNorm2d(conv_channels))
122
+ layers.append(nn.ReLU(inplace=True))
123
+
124
+ layers.append(
125
+ nn.Conv2d(in_channels=conv_channels,
126
+ out_channels=out_channels,
127
+ kernel_size=kernel_size,
128
+ stride=1,
129
+ padding=padding)
130
+ )
131
+
132
+ if len(layers) > 1:
133
+ self.final_layer = nn.Sequential(*layers)
134
+ else:
135
+ self.final_layer = layers[0]
136
+
137
+ def get_loss(self, output, target, target_weight):
138
+ """Calculate top-down keypoint loss.
139
+
140
+ Note:
141
+ - batch_size: N
142
+ - num_keypoints: K
143
+ - heatmaps height: H
144
+ - heatmaps width: W
145
+
146
+ Args:
147
+ output (torch.Tensor[N,K,H,W]): Output heatmaps.
148
+ target (torch.Tensor[N,K,H,W]): Target heatmaps.
149
+ target_weight (torch.Tensor[N,K,1]):
150
+ Weights across different joint types.
151
+ """
152
+
153
+ losses = dict()
154
+
155
+ assert not isinstance(self.loss, nn.Sequential)
156
+ assert target.dim() == 4 and target_weight.dim() == 3
157
+ losses['heatmap_loss'] = self.loss(output, target, target_weight)
158
+
159
+ return losses
160
+
161
+ def get_accuracy(self, output, target, target_weight):
162
+ """Calculate accuracy for top-down keypoint loss.
163
+
164
+ Note:
165
+ - batch_size: N
166
+ - num_keypoints: K
167
+ - heatmaps height: H
168
+ - heatmaps width: W
169
+
170
+ Args:
171
+ output (torch.Tensor[N,K,H,W]): Output heatmaps.
172
+ target (torch.Tensor[N,K,H,W]): Target heatmaps.
173
+ target_weight (torch.Tensor[N,K,1]):
174
+ Weights across different joint types.
175
+ """
176
+
177
+ accuracy = dict()
178
+
179
+ if self.target_type == 'GaussianHeatmap':
180
+ _, avg_acc, _ = pose_pck_accuracy(
181
+ output.detach().cpu().numpy(),
182
+ target.detach().cpu().numpy(),
183
+ target_weight.detach().cpu().numpy().squeeze(-1) > 0)
184
+ accuracy['acc_pose'] = float(avg_acc)
185
+
186
+ return accuracy
187
+
188
+ def forward(self, x):
189
+ """Forward function."""
190
+ x = self._transform_inputs(x)
191
+ x = self.deconv_layers(x)
192
+ x = self.final_layer(x)
193
+ return x
194
+
195
+ def inference_model(self, x, flip_pairs=None):
196
+ """Inference function.
197
+
198
+ Returns:
199
+ output_heatmap (np.ndarray): Output heatmaps.
200
+
201
+ Args:
202
+ x (torch.Tensor[N,K,H,W]): Input features.
203
+ flip_pairs (None | list[tuple]):
204
+ Pairs of keypoints which are mirrored.
205
+ """
206
+ output = self.forward(x)
207
+
208
+ if flip_pairs is not None:
209
+ output_heatmap = flip_back(
210
+ output.detach().cpu().numpy(),
211
+ flip_pairs,
212
+ target_type=self.target_type)
213
+ # feature is not aligned, shift flipped heatmap for higher accuracy
214
+ if self.test_cfg.get('shift_heatmap', False):
215
+ output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1]
216
+ else:
217
+ output_heatmap = output.detach().cpu().numpy()
218
+ return output_heatmap
219
+
220
+ def _init_inputs(self, in_channels, in_index, input_transform):
221
+ """Check and initialize input transforms.
222
+
223
+ The in_channels, in_index and input_transform must match.
224
+ Specifically, when input_transform is None, only single feature map
225
+ will be selected. So in_channels and in_index must be of type int.
226
+ When input_transform is not None, in_channels and in_index must be
227
+ list or tuple, with the same length.
228
+
229
+ Args:
230
+ in_channels (int|Sequence[int]): Input channels.
231
+ in_index (int|Sequence[int]): Input feature index.
232
+ input_transform (str|None): Transformation type of input features.
233
+ Options: 'resize_concat', 'multiple_select', None.
234
+
235
+ - 'resize_concat': Multiple feature maps will be resize to the
236
+ same size as first one and than concat together.
237
+ Usually used in FCN head of HRNet.
238
+ - 'multiple_select': Multiple feature maps will be bundled into
239
+ a list and passed into decode head.
240
+ - None: Only one select feature map is allowed.
241
+ """
242
+
243
+ if input_transform is not None:
244
+ assert input_transform in ['resize_concat', 'multiple_select']
245
+ self.input_transform = input_transform
246
+ self.in_index = in_index
247
+ if input_transform is not None:
248
+ assert isinstance(in_channels, (list, tuple))
249
+ assert isinstance(in_index, (list, tuple))
250
+ assert len(in_channels) == len(in_index)
251
+ if input_transform == 'resize_concat':
252
+ self.in_channels = sum(in_channels)
253
+ else:
254
+ self.in_channels = in_channels
255
+ else:
256
+ assert isinstance(in_channels, int)
257
+ assert isinstance(in_index, int)
258
+ self.in_channels = in_channels
259
+
260
+ def _transform_inputs(self, inputs):
261
+ """Transform inputs for decoder.
262
+
263
+ Args:
264
+ inputs (list[Tensor] | Tensor): multi-level img features.
265
+
266
+ Returns:
267
+ Tensor: The transformed inputs
268
+ """
269
+ if not isinstance(inputs, list):
270
+ if self.upsample > 0:
271
+ raise NotImplementedError
272
+ return inputs
273
+
274
+ if self.input_transform == 'resize_concat':
275
+ inputs = [inputs[i] for i in self.in_index]
276
+ upsampled_inputs = [
277
+ resize(
278
+ input=x,
279
+ size=inputs[0].shape[2:],
280
+ mode='bilinear',
281
+ align_corners=self.align_corners) for x in inputs
282
+ ]
283
+ inputs = torch.cat(upsampled_inputs, dim=1)
284
+ elif self.input_transform == 'multiple_select':
285
+ inputs = [inputs[i] for i in self.in_index]
286
+ else:
287
+ inputs = inputs[self.in_index]
288
+
289
+ return inputs
290
+
291
+ def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
292
+ """Make deconv layers."""
293
+ if num_layers != len(num_filters):
294
+ error_msg = f'num_layers({num_layers}) ' \
295
+ f'!= length of num_filters({len(num_filters)})'
296
+ raise ValueError(error_msg)
297
+ if num_layers != len(num_kernels):
298
+ error_msg = f'num_layers({num_layers}) ' \
299
+ f'!= length of num_kernels({len(num_kernels)})'
300
+ raise ValueError(error_msg)
301
+
302
+ layers = []
303
+ for i in range(num_layers):
304
+ kernel, padding, output_padding = \
305
+ self._get_deconv_cfg(num_kernels[i])
306
+
307
+ planes = num_filters[i]
308
+ layers.append(
309
+ nn.ConvTranspose2d(in_channels=self.in_channels,
310
+ out_channels=planes,
311
+ kernel_size=kernel,
312
+ stride=2,
313
+ padding=padding,
314
+ output_padding=output_padding,
315
+ bias=False)
316
+ )
317
+ layers.append(nn.BatchNorm2d(planes))
318
+ layers.append(nn.ReLU(inplace=True))
319
+ self.in_channels = planes
320
+
321
+ return nn.Sequential(*layers)
322
+
323
+ def init_weights(self):
324
+ """Initialize model weights."""
325
+ for _, m in self.deconv_layers.named_modules():
326
+ if isinstance(m, nn.ConvTranspose2d):
327
+ normal_init(m, std=0.001)
328
+ elif isinstance(m, nn.BatchNorm2d):
329
+ constant_init(m, 1)
330
+ for m in self.final_layer.modules():
331
+ if isinstance(m, nn.Conv2d):
332
+ normal_init(m, std=0.001, bias=0)
333
+ elif isinstance(m, nn.BatchNorm2d):
334
+ constant_init(m, 1)
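A shape check for the head above, fed with the 1/16-resolution backbone features; this is an illustrative sketch and the deconv configuration is an assumption, not necessarily the one used by this repo's configs:

import torch
from easy_ViTPose.vit_models.head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead

head = TopdownHeatmapSimpleHead(in_channels=768, out_channels=17,
                                num_deconv_layers=2,
                                num_deconv_filters=(256, 256),
                                num_deconv_kernels=(4, 4),
                                extra=dict(final_conv_kernel=1))
head.init_weights()

heatmaps = head(torch.randn(1, 768, 16, 12))  # features shaped like the ViT backbone output
print(heatmaps.shape)  # torch.Size([1, 17, 64, 48]): one heatmap per keypoint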
easy_ViTPose/vit_models/losses/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .classfication_loss import BCELoss
3
+ from .heatmap_loss import AdaptiveWingLoss
4
+ from .mesh_loss import GANLoss, MeshLoss
5
+ from .mse_loss import JointsMSELoss, JointsOHKMMSELoss
6
+ from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory
7
+ from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss,
8
+ SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss,
9
+ WingLoss)
10
+
11
+ __all__ = [
12
+ 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss',
13
+ 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss',
14
+ 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss',
15
+ 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss'
16
+ ]
easy_ViTPose/vit_models/losses/classfication_loss.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ __all__ = ['BCELoss']
7
+
8
+
9
+ class BCELoss(nn.Module):
10
+ """Binary Cross Entropy loss."""
11
+
12
+ def __init__(self, use_target_weight=False, loss_weight=1.):
13
+ super().__init__()
14
+ self.criterion = F.binary_cross_entropy
15
+ self.use_target_weight = use_target_weight
16
+ self.loss_weight = loss_weight
17
+
18
+ def forward(self, output, target, target_weight=None):
19
+ """Forward function.
20
+
21
+ Note:
22
+ - batch_size: N
23
+ - num_labels: K
24
+
25
+ Args:
26
+ output (torch.Tensor[N, K]): Output classification.
27
+ target (torch.Tensor[N, K]): Target classification.
28
+ target_weight (torch.Tensor[N, K] or torch.Tensor[N]):
29
+ Weights across different labels.
30
+ """
31
+
32
+ if self.use_target_weight:
33
+ assert target_weight is not None
34
+ loss = self.criterion(output, target, reduction='none')
35
+ if target_weight.dim() == 1:
36
+ target_weight = target_weight[:, None]
37
+ loss = (loss * target_weight).mean()
38
+ else:
39
+ loss = self.criterion(output, target)
40
+
41
+ return loss * self.loss_weight
easy_ViTPose/vit_models/losses/heatmap_loss.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ class AdaptiveWingLoss(nn.Module):
7
+ """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face
8
+ Alignment via Heatmap Regression' Wang et al. ICCV'2019.
9
+
10
+ Args:
11
+ alpha (float), omega (float), epsilon (float), theta (float)
12
+ are hyper-parameters.
13
+ use_target_weight (bool): Option to use weighted MSE loss.
14
+ Different joint types may have different target weights.
15
+ loss_weight (float): Weight of the loss. Default: 1.0.
16
+ """
17
+
18
+ def __init__(self,
19
+ alpha=2.1,
20
+ omega=14,
21
+ epsilon=1,
22
+ theta=0.5,
23
+ use_target_weight=False,
24
+ loss_weight=1.):
25
+ super().__init__()
26
+ self.alpha = float(alpha)
27
+ self.omega = float(omega)
28
+ self.epsilon = float(epsilon)
29
+ self.theta = float(theta)
30
+ self.use_target_weight = use_target_weight
31
+ self.loss_weight = loss_weight
32
+
33
+ def criterion(self, pred, target):
34
+ """Criterion of wingloss.
35
+
36
+ Note:
37
+ batch_size: N
38
+ num_keypoints: K
39
+
40
+ Args:
41
+ pred (torch.Tensor[NxKxHxW]): Predicted heatmaps.
42
+ target (torch.Tensor[NxKxHxW]): Target heatmaps.
43
+ """
44
+ H, W = pred.shape[2:4]
45
+ delta = (target - pred).abs()
46
+
47
+ A = self.omega * (
48
+ 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target))
49
+ ) * (self.alpha - target) * (torch.pow(
50
+ self.theta / self.epsilon,
51
+ self.alpha - target - 1)) * (1 / self.epsilon)
52
+ C = self.theta * A - self.omega * torch.log(
53
+ 1 + torch.pow(self.theta / self.epsilon, self.alpha - target))
54
+
55
+ losses = torch.where(
56
+ delta < self.theta,
57
+ self.omega *
58
+ torch.log(1 +
59
+ torch.pow(delta / self.epsilon, self.alpha - target)),
60
+ A * delta - C)
61
+
62
+ return torch.mean(losses)
63
+
64
+ def forward(self, output, target, target_weight):
65
+ """Forward function.
66
+
67
+ Note:
68
+ batch_size: N
69
+ num_keypoints: K
70
+
71
+ Args:
72
+ output (torch.Tensor[NxKxHxW]): Output heatmaps.
73
+ target (torch.Tensor[NxKxHxW]): Target heatmaps.
74
+ target_weight (torch.Tensor[NxKx1]):
75
+ Weights across different joint types.
76
+ """
77
+ if self.use_target_weight:
78
+ loss = self.criterion(output * target_weight.unsqueeze(-1),
79
+ target * target_weight.unsqueeze(-1))
80
+ else:
81
+ loss = self.criterion(output, target)
82
+
83
+ return loss * self.loss_weight
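A minimal sketch of how this heatmap loss is typically invoked (illustrative only; shapes follow the NxKxHxW convention of the docstrings above, and the import path assumes the repo root is importable):

import torch
from easy_ViTPose.vit_models.losses.heatmap_loss import AdaptiveWingLoss

criterion = AdaptiveWingLoss(use_target_weight=True)
pred = torch.rand(2, 17, 64, 48)       # N=2 images, K=17 predicted heatmaps
gt = torch.rand(2, 17, 64, 48)         # target heatmaps
target_weight = torch.ones(2, 17, 1)   # per-joint visibility weights, broadcast over H and W
loss = criterion(pred, gt, target_weight)
print(loss.item())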
easy_ViTPose/vit_models/losses/mesh_loss.py ADDED
@@ -0,0 +1,402 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
+ import torch.nn.functional as F
4
+
5
+ __all__ = ['MeshLoss', 'GANLoss']
6
+
7
+ def rot6d_to_rotmat(x):
8
+ """Convert 6D rotation representation to 3x3 rotation matrix.
9
+
10
+ Based on Zhou et al., "On the Continuity of Rotation
11
+ Representations in Neural Networks", CVPR 2019
12
+ Input:
13
+ (B,6) Batch of 6-D rotation representations
14
+ Output:
15
+ (B,3,3) Batch of corresponding rotation matrices
16
+ """
17
+ x = x.view(-1, 3, 2)
18
+ a1 = x[:, :, 0]
19
+ a2 = x[:, :, 1]
20
+ b1 = F.normalize(a1)
21
+ b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
22
+ b3 = torch.cross(b1, b2)
23
+ return torch.stack((b1, b2, b3), dim=-1)
24
+
25
+
26
+ def batch_rodrigues(theta):
27
+ """Convert axis-angle representation to rotation matrix.
28
+ Args:
29
+ theta: size = [B, 3]
30
+ Returns:
31
+ Rotation matrix corresponding to the quaternion
32
+ -- size = [B, 3, 3]
33
+ """
34
+ l2norm = torch.norm(theta + 1e-8, p=2, dim=1)
35
+ angle = torch.unsqueeze(l2norm, -1)
36
+ normalized = torch.div(theta, angle)
37
+ angle = angle * 0.5
38
+ v_cos = torch.cos(angle)
39
+ v_sin = torch.sin(angle)
40
+ quat = torch.cat([v_cos, v_sin * normalized], dim=1)
41
+ return quat_to_rotmat(quat)
42
+
43
+
44
+ def quat_to_rotmat(quat):
45
+ """Convert quaternion coefficients to rotation matrix.
46
+ Args:
47
+ quat: size = [B, 4] 4 <===>(w, x, y, z)
48
+ Returns:
49
+ Rotation matrix corresponding to the quaternion
50
+ -- size = [B, 3, 3]
51
+ """
52
+ norm_quat = quat
53
+ norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
54
+ w, x, y, z = norm_quat[:, 0], norm_quat[:, 1],\
55
+ norm_quat[:, 2], norm_quat[:, 3]
56
+
57
+ B = quat.size(0)
58
+
59
+ w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
60
+ wx, wy, wz = w * x, w * y, w * z
61
+ xy, xz, yz = x * y, x * z, y * z
62
+
63
+ rotMat = torch.stack([
64
+ w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy,
65
+ w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz,
66
+ w2 - x2 - y2 + z2
67
+ ],
68
+ dim=1).view(B, 3, 3)
69
+ return rotMat
70
+
71
+
72
+
73
+ def perspective_projection(points, rotation, translation, focal_length,
74
+ camera_center):
75
+ """This function computes the perspective projection of a set of 3D points.
76
+
77
+ Note:
78
+ - batch size: B
79
+ - point number: N
80
+
81
+ Args:
82
+ points (Tensor([B, N, 3])): A set of 3D points
83
+ rotation (Tensor([B, 3, 3])): Camera rotation matrix
84
+ translation (Tensor([B, 3])): Camera translation
85
+ focal_length (Tensor([B,])): Focal length
86
+ camera_center (Tensor([B, 2])): Camera center
87
+
88
+ Returns:
89
+ projected_points (Tensor([B, N, 2])): Projected 2D
90
+ points in image space.
91
+ """
92
+
93
+ batch_size = points.shape[0]
94
+ K = torch.zeros([batch_size, 3, 3], device=points.device)
95
+ K[:, 0, 0] = focal_length
96
+ K[:, 1, 1] = focal_length
97
+ K[:, 2, 2] = 1.
98
+ K[:, :-1, -1] = camera_center
99
+
100
+ # Transform points
101
+ points = torch.einsum('bij,bkj->bki', rotation, points)
102
+ points = points + translation.unsqueeze(1)
103
+
104
+ # Apply perspective distortion
105
+ projected_points = points / points[:, :, -1].unsqueeze(-1)
106
+
107
+ # Apply camera intrinsics
108
+ projected_points = torch.einsum('bij,bkj->bki', K, projected_points)
109
+ projected_points = projected_points[:, :, :-1]
110
+ return projected_points
111
+
112
+
113
+ class MeshLoss(nn.Module):
114
+ """Mix loss for 3D human mesh. It is composed of loss on 2D joints, 3D
115
+ joints, mesh vertices and smpl parameters (if any).
116
+
117
+ Args:
118
+ joints_2d_loss_weight (float): Weight for loss on 2D joints.
119
+ joints_3d_loss_weight (float): Weight for loss on 3D joints.
120
+ vertex_loss_weight (float): Weight for loss on 3D vertices.
121
+ smpl_pose_loss_weight (float): Weight for loss on SMPL
122
+ pose parameters.
123
+ smpl_beta_loss_weight (float): Weight for loss on SMPL
124
+ shape parameters.
125
+ img_res (int): Input image resolution.
126
+ focal_length (float): Focal length of camera model. Default=5000.
127
+ """
128
+
129
+ def __init__(self,
130
+ joints_2d_loss_weight,
131
+ joints_3d_loss_weight,
132
+ vertex_loss_weight,
133
+ smpl_pose_loss_weight,
134
+ smpl_beta_loss_weight,
135
+ img_res,
136
+ focal_length=5000):
137
+
138
+ super().__init__()
139
+ # Per-vertex loss on the mesh
140
+ self.criterion_vertex = nn.L1Loss(reduction='none')
141
+
142
+ # Joints (2D and 3D) loss
143
+ self.criterion_joints_2d = nn.SmoothL1Loss(reduction='none')
144
+ self.criterion_joints_3d = nn.SmoothL1Loss(reduction='none')
145
+
146
+ # Loss for SMPL parameter regression
147
+ self.criterion_regr = nn.MSELoss(reduction='none')
148
+
149
+ self.joints_2d_loss_weight = joints_2d_loss_weight
150
+ self.joints_3d_loss_weight = joints_3d_loss_weight
151
+ self.vertex_loss_weight = vertex_loss_weight
152
+ self.smpl_pose_loss_weight = smpl_pose_loss_weight
153
+ self.smpl_beta_loss_weight = smpl_beta_loss_weight
154
+ self.focal_length = focal_length
155
+ self.img_res = img_res
156
+
157
+ def joints_2d_loss(self, pred_joints_2d, gt_joints_2d, joints_2d_visible):
158
+ """Compute 2D reprojection loss on the joints.
159
+
160
+ The loss is weighted by joints_2d_visible.
161
+ """
162
+ conf = joints_2d_visible.float()
163
+ loss = (conf *
164
+ self.criterion_joints_2d(pred_joints_2d, gt_joints_2d)).mean()
165
+ return loss
166
+
167
+ def joints_3d_loss(self, pred_joints_3d, gt_joints_3d, joints_3d_visible):
168
+ """Compute 3D joints loss for the examples that 3D joint annotations
169
+ are available.
170
+
171
+ The loss is weighted by joints_3d_visible.
172
+ """
173
+ conf = joints_3d_visible.float()
174
+ if len(gt_joints_3d) > 0:
175
+ gt_pelvis = (gt_joints_3d[:, 2, :] + gt_joints_3d[:, 3, :]) / 2
176
+ gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :]
177
+ pred_pelvis = (pred_joints_3d[:, 2, :] +
178
+ pred_joints_3d[:, 3, :]) / 2
179
+ pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :]
180
+ return (
181
+ conf *
182
+ self.criterion_joints_3d(pred_joints_3d, gt_joints_3d)).mean()
183
+ return pred_joints_3d.sum() * 0
184
+
185
+ def vertex_loss(self, pred_vertices, gt_vertices, has_smpl):
186
+ """Compute 3D vertex loss for the examples that 3D human mesh
187
+ annotations are available.
188
+
189
+ The loss is weighted by the has_smpl.
190
+ """
191
+ conf = has_smpl.float()
192
+ loss_vertex = self.criterion_vertex(pred_vertices, gt_vertices)
193
+ loss_vertex = (conf[:, None, None] * loss_vertex).mean()
194
+ return loss_vertex
195
+
196
+ def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas,
197
+ has_smpl):
198
+ """Compute SMPL parameters loss for the examples that SMPL parameter
199
+ annotations are available.
200
+
201
+ The loss is weighted by has_smpl.
202
+ """
203
+ conf = has_smpl.float()
204
+ gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3)
205
+ loss_regr_pose = self.criterion_regr(pred_rotmat, gt_rotmat)
206
+ loss_regr_betas = self.criterion_regr(pred_betas, gt_betas)
207
+ loss_regr_pose = (conf[:, None, None, None] * loss_regr_pose).mean()
208
+ loss_regr_betas = (conf[:, None] * loss_regr_betas).mean()
209
+ return loss_regr_pose, loss_regr_betas
210
+
211
+ def project_points(self, points_3d, camera):
212
+ """Perform orthographic projection of 3D points using the camera
213
+ parameters, return projected 2D points in image plane.
214
+
215
+ Note:
216
+ - batch size: B
217
+ - point number: N
218
+
219
+ Args:
220
+ points_3d (Tensor([B, N, 3])): 3D points.
221
+ camera (Tensor([B, 3])): camera parameters with the
222
+ 3 channel as (scale, translation_x, translation_y)
223
+
224
+ Returns:
225
+ Tensor([B, N, 2]): projected 2D points \
226
+ in image space.
227
+ """
228
+ batch_size = points_3d.shape[0]
229
+ device = points_3d.device
230
+ cam_t = torch.stack([
231
+ camera[:, 1], camera[:, 2], 2 * self.focal_length /
232
+ (self.img_res * camera[:, 0] + 1e-9)
233
+ ],
234
+ dim=-1)
235
+ camera_center = camera.new_zeros([batch_size, 2])
236
+ rot_t = torch.eye(
237
+ 3, device=device,
238
+ dtype=points_3d.dtype).unsqueeze(0).expand(batch_size, -1, -1)
239
+ joints_2d = perspective_projection(
240
+ points_3d,
241
+ rotation=rot_t,
242
+ translation=cam_t,
243
+ focal_length=self.focal_length,
244
+ camera_center=camera_center)
245
+ return joints_2d
246
+
247
+ def forward(self, output, target):
248
+ """Forward function.
249
+
250
+ Args:
251
+ output (dict): dict of network predicted results.
252
+ Keys: 'vertices', 'joints_3d', 'camera',
253
+ 'pose'(optional), 'beta'(optional)
254
+ target (dict): dict of ground-truth labels.
255
+ Keys: 'vertices', 'joints_3d', 'joints_3d_visible',
256
+ 'joints_2d', 'joints_2d_visible', 'pose', 'beta',
257
+ 'has_smpl'
258
+
259
+ Returns:
260
+ dict: dict of losses.
261
+ """
262
+ losses = {}
263
+
264
+ # Per-vertex loss for the shape
265
+ pred_vertices = output['vertices']
266
+
267
+ gt_vertices = target['vertices']
268
+ has_smpl = target['has_smpl']
269
+ loss_vertex = self.vertex_loss(pred_vertices, gt_vertices, has_smpl)
270
+ losses['vertex_loss'] = loss_vertex * self.vertex_loss_weight
271
+
272
+ # Compute loss on SMPL parameters, if available
273
+ if 'pose' in output.keys() and 'beta' in output.keys():
274
+ pred_rotmat = output['pose']
275
+ pred_betas = output['beta']
276
+ gt_pose = target['pose']
277
+ gt_betas = target['beta']
278
+ loss_regr_pose, loss_regr_betas = self.smpl_losses(
279
+ pred_rotmat, pred_betas, gt_pose, gt_betas, has_smpl)
280
+ losses['smpl_pose_loss'] = \
281
+ loss_regr_pose * self.smpl_pose_loss_weight
282
+ losses['smpl_beta_loss'] = \
283
+ loss_regr_betas * self.smpl_beta_loss_weight
284
+
285
+ # Compute 3D joints loss
286
+ pred_joints_3d = output['joints_3d']
287
+ gt_joints_3d = target['joints_3d']
288
+ joints_3d_visible = target['joints_3d_visible']
289
+ loss_joints_3d = self.joints_3d_loss(pred_joints_3d, gt_joints_3d,
290
+ joints_3d_visible)
291
+ losses['joints_3d_loss'] = loss_joints_3d * self.joints_3d_loss_weight
292
+
293
+ # Compute 2D reprojection loss for the 2D joints
294
+ pred_camera = output['camera']
295
+ gt_joints_2d = target['joints_2d']
296
+ joints_2d_visible = target['joints_2d_visible']
297
+ pred_joints_2d = self.project_points(pred_joints_3d, pred_camera)
298
+
299
+ # Normalize keypoints to [-1,1]
300
+ # The coordinate origin of pred_joints_2d is
301
+ # the center of the input image.
302
+ pred_joints_2d = 2 * pred_joints_2d / (self.img_res - 1)
303
+ # The coordinate origin of gt_joints_2d is
304
+ # the top left corner of the input image.
305
+ gt_joints_2d = 2 * gt_joints_2d / (self.img_res - 1) - 1
306
+ loss_joints_2d = self.joints_2d_loss(pred_joints_2d, gt_joints_2d,
307
+ joints_2d_visible)
308
+ losses['joints_2d_loss'] = loss_joints_2d * self.joints_2d_loss_weight
309
+
310
+ return losses
311
+
312
+
313
+ class GANLoss(nn.Module):
314
+ """Define GAN loss.
315
+
316
+ Args:
317
+ gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'.
318
+ real_label_val (float): The value for real label. Default: 1.0.
319
+ fake_label_val (float): The value for fake label. Default: 0.0.
320
+ loss_weight (float): Loss weight. Default: 1.0.
321
+ Note that loss_weight is only for generators; and it is always 1.0
322
+ for discriminators.
323
+ """
324
+
325
+ def __init__(self,
326
+ gan_type,
327
+ real_label_val=1.0,
328
+ fake_label_val=0.0,
329
+ loss_weight=1.0):
330
+ super().__init__()
331
+ self.gan_type = gan_type
332
+ self.loss_weight = loss_weight
333
+ self.real_label_val = real_label_val
334
+ self.fake_label_val = fake_label_val
335
+
336
+ if self.gan_type == 'vanilla':
337
+ self.loss = nn.BCEWithLogitsLoss()
338
+ elif self.gan_type == 'lsgan':
339
+ self.loss = nn.MSELoss()
340
+ elif self.gan_type == 'wgan':
341
+ self.loss = self._wgan_loss
342
+ elif self.gan_type == 'hinge':
343
+ self.loss = nn.ReLU()
344
+ else:
345
+ raise NotImplementedError(
346
+ f'GAN type {self.gan_type} is not implemented.')
347
+
348
+ @staticmethod
349
+ def _wgan_loss(input, target):
350
+ """wgan loss.
351
+
352
+ Args:
353
+ input (Tensor): Input tensor.
354
+ target (bool): Target label.
355
+
356
+ Returns:
357
+ Tensor: wgan loss.
358
+ """
359
+ return -input.mean() if target else input.mean()
360
+
361
+ def get_target_label(self, input, target_is_real):
362
+ """Get target label.
363
+
364
+ Args:
365
+ input (Tensor): Input tensor.
366
+ target_is_real (bool): Whether the target is real or fake.
367
+
368
+ Returns:
369
+ (bool | Tensor): Target tensor. Return bool for wgan, \
370
+ otherwise, return Tensor.
371
+ """
372
+
373
+ if self.gan_type == 'wgan':
374
+ return target_is_real
375
+ target_val = (
376
+ self.real_label_val if target_is_real else self.fake_label_val)
377
+ return input.new_ones(input.size()) * target_val
378
+
379
+ def forward(self, input, target_is_real, is_disc=False):
380
+ """
381
+ Args:
382
+ input (Tensor): The input for the loss module, i.e., the network
383
+ prediction.
384
+ target_is_real (bool): Whether the target is real or fake.
385
+ is_disc (bool): Whether the loss for discriminators or not.
386
+ Default: False.
387
+
388
+ Returns:
389
+ Tensor: GAN loss value.
390
+ """
391
+ target_label = self.get_target_label(input, target_is_real)
392
+ if self.gan_type == 'hinge':
393
+ if is_disc: # for discriminators in hinge-gan
394
+ input = -input if target_is_real else input
395
+ loss = self.loss(1 + input).mean()
396
+ else: # for generators in hinge-gan
397
+ loss = -input.mean()
398
+ else: # other gan types
399
+ loss = self.loss(input, target_label)
400
+
401
+ # loss_weight is always 1.0 for discriminators
402
+ return loss if is_disc else loss * self.loss_weight
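For orientation, a small usage sketch of GANLoss (illustrative, not part of the commit; the 'lsgan' type, the loss weight, and the tensor shape are arbitrary choices):

import torch
from easy_ViTPose.vit_models.losses.mesh_loss import GANLoss

adversarial = GANLoss(gan_type='lsgan', loss_weight=0.5)
disc_scores = torch.randn(8, 1)                                          # raw discriminator outputs
g_loss = adversarial(disc_scores, target_is_real=True)                   # generator loss, scaled by loss_weight
d_loss = adversarial(disc_scores, target_is_real=False, is_disc=True)    # discriminator loss, weight fixed at 1.0
print(g_loss.item(), d_loss.item())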
easy_ViTPose/vit_models/losses/mse_loss.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ __all__ = ['JointsMSELoss', 'JointsOHKMMSELoss',]
7
+
8
+
9
+ class JointsMSELoss(nn.Module):
10
+ """MSE loss for heatmaps.
11
+
12
+ Args:
13
+ use_target_weight (bool): Option to use weighted MSE loss.
14
+ Different joint types may have different target weights.
15
+ loss_weight (float): Weight of the loss. Default: 1.0.
16
+ """
17
+
18
+ def __init__(self, use_target_weight=False, loss_weight=1.):
19
+ super().__init__()
20
+ self.criterion = nn.MSELoss()
21
+ self.use_target_weight = use_target_weight
22
+ self.loss_weight = loss_weight
23
+
24
+ def forward(self, output, target, target_weight):
25
+ """Forward function."""
26
+ batch_size = output.size(0)
27
+ num_joints = output.size(1)
28
+
29
+ heatmaps_pred = output.reshape(
30
+ (batch_size, num_joints, -1)).split(1, 1)
31
+ heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1)
32
+
33
+ loss = 0.
34
+
35
+ for idx in range(num_joints):
36
+ heatmap_pred = heatmaps_pred[idx].squeeze(1)
37
+ heatmap_gt = heatmaps_gt[idx].squeeze(1)
38
+ if self.use_target_weight:
39
+ loss += self.criterion(heatmap_pred * target_weight[:, idx],
40
+ heatmap_gt * target_weight[:, idx])
41
+ else:
42
+ loss += self.criterion(heatmap_pred, heatmap_gt)
43
+
44
+ return loss / num_joints * self.loss_weight
45
+
46
+
47
+ class CombinedTargetMSELoss(nn.Module):
48
+ """MSE loss for combined target.
49
+ CombinedTarget: The combination of classification target
50
+ (response map) and regression target (offset map).
51
+ Paper ref: Huang et al. The Devil is in the Details: Delving into
52
+ Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
53
+
54
+ Args:
55
+ use_target_weight (bool): Option to use weighted MSE loss.
56
+ Different joint types may have different target weights.
57
+ loss_weight (float): Weight of the loss. Default: 1.0.
58
+ """
59
+
60
+ def __init__(self, use_target_weight, loss_weight=1.):
61
+ super().__init__()
62
+ self.criterion = nn.MSELoss(reduction='mean')
63
+ self.use_target_weight = use_target_weight
64
+ self.loss_weight = loss_weight
65
+
66
+ def forward(self, output, target, target_weight):
67
+ batch_size = output.size(0)
68
+ num_channels = output.size(1)
69
+ heatmaps_pred = output.reshape(
70
+ (batch_size, num_channels, -1)).split(1, 1)
71
+ heatmaps_gt = target.reshape(
72
+ (batch_size, num_channels, -1)).split(1, 1)
73
+ loss = 0.
74
+ num_joints = num_channels // 3
75
+ for idx in range(num_joints):
76
+ heatmap_pred = heatmaps_pred[idx * 3].squeeze()
77
+ heatmap_gt = heatmaps_gt[idx * 3].squeeze()
78
+ offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze()
79
+ offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze()
80
+ offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze()
81
+ offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze()
82
+ if self.use_target_weight:
83
+ heatmap_pred = heatmap_pred * target_weight[:, idx]
84
+ heatmap_gt = heatmap_gt * target_weight[:, idx]
85
+ # classification loss
86
+ loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt)
87
+ # regression loss
88
+ loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred,
89
+ heatmap_gt * offset_x_gt)
90
+ loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred,
91
+ heatmap_gt * offset_y_gt)
92
+ return loss / num_joints * self.loss_weight
93
+
94
+
95
+ class JointsOHKMMSELoss(nn.Module):
96
+ """MSE loss with online hard keypoint mining.
97
+
98
+ Args:
99
+ use_target_weight (bool): Option to use weighted MSE loss.
100
+ Different joint types may have different target weights.
101
+ topk (int): Only top k joint losses are kept.
102
+ loss_weight (float): Weight of the loss. Default: 1.0.
103
+ """
104
+
105
+ def __init__(self, use_target_weight=False, topk=8, loss_weight=1.):
106
+ super().__init__()
107
+ assert topk > 0
108
+ self.criterion = nn.MSELoss(reduction='none')
109
+ self.use_target_weight = use_target_weight
110
+ self.topk = topk
111
+ self.loss_weight = loss_weight
112
+
113
+ def _ohkm(self, loss):
114
+ """Online hard keypoint mining."""
115
+ ohkm_loss = 0.
116
+ N = len(loss)
117
+ for i in range(N):
118
+ sub_loss = loss[i]
119
+ _, topk_idx = torch.topk(
120
+ sub_loss, k=self.topk, dim=0, sorted=False)
121
+ tmp_loss = torch.gather(sub_loss, 0, topk_idx)
122
+ ohkm_loss += torch.sum(tmp_loss) / self.topk
123
+ ohkm_loss /= N
124
+ return ohkm_loss
125
+
126
+ def forward(self, output, target, target_weight):
127
+ """Forward function."""
128
+ batch_size = output.size(0)
129
+ num_joints = output.size(1)
130
+ if num_joints < self.topk:
131
+ raise ValueError(f'topk ({self.topk}) should not '
132
+ f'larger than num_joints ({num_joints}).')
133
+ heatmaps_pred = output.reshape(
134
+ (batch_size, num_joints, -1)).split(1, 1)
135
+ heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1)
136
+
137
+ losses = []
138
+ for idx in range(num_joints):
139
+ heatmap_pred = heatmaps_pred[idx].squeeze(1)
140
+ heatmap_gt = heatmaps_gt[idx].squeeze(1)
141
+ if self.use_target_weight:
142
+ losses.append(
143
+ self.criterion(heatmap_pred * target_weight[:, idx],
144
+ heatmap_gt * target_weight[:, idx]))
145
+ else:
146
+ losses.append(self.criterion(heatmap_pred, heatmap_gt))
147
+
148
+ losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses]
149
+ losses = torch.cat(losses, dim=1)
150
+
151
+ return self._ohkm(losses) * self.loss_weight
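The two exported heatmap losses share the same call convention; a brief sketch (illustrative shapes, repo root assumed importable):

import torch
from easy_ViTPose.vit_models.losses.mse_loss import JointsMSELoss, JointsOHKMMSELoss

pred = torch.rand(2, 17, 64, 48)       # N=2, K=17 predicted heatmaps
gt = torch.rand(2, 17, 64, 48)         # target heatmaps
target_weight = torch.ones(2, 17, 1)   # per-joint weights

mse = JointsMSELoss(use_target_weight=True)
ohkm = JointsOHKMMSELoss(use_target_weight=True, topk=8)  # keep only the 8 hardest joints per sample
print(mse(pred, gt, target_weight).item())
print(ohkm(pred, gt, target_weight).item())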
easy_ViTPose/vit_models/losses/multi_loss_factory.py ADDED
@@ -0,0 +1,279 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation
3
+ # Original licence: Copyright (c) Microsoft, under the MIT License.
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ __all__ = ['HeatmapLoss', 'AELoss', 'MultiLossFactory']
11
+
12
+
13
+ def _make_input(t, requires_grad=False, device=torch.device('cpu')):
14
+ """Make zero inputs for AE loss.
15
+
16
+ Args:
17
+ t (torch.Tensor): input
18
+ requires_grad (bool): Option to use requires_grad.
19
+ device: torch device
20
+
21
+ Returns:
22
+ torch.Tensor: zero input.
23
+ """
24
+ inp = torch.autograd.Variable(t, requires_grad=requires_grad)
25
+ inp = inp.sum()
26
+ inp = inp.to(device)
27
+ return inp
28
+
29
+
30
+ class HeatmapLoss(nn.Module):
31
+ """Accumulate the heatmap loss for each image in the batch.
32
+
33
+ Args:
34
+ supervise_empty (bool): Whether to supervise empty channels.
35
+ """
36
+
37
+ def __init__(self, supervise_empty=True):
38
+ super().__init__()
39
+ self.supervise_empty = supervise_empty
40
+
41
+ def forward(self, pred, gt, mask):
42
+ """Forward function.
43
+
44
+ Note:
45
+ - batch_size: N
46
+ - heatmaps width: W
47
+ - heatmaps height: H
48
+ - max_num_people: M
49
+ - num_keypoints: K
50
+
51
+ Args:
52
+ pred (torch.Tensor[N,K,H,W]):heatmap of output.
53
+ gt (torch.Tensor[N,K,H,W]): target heatmap.
54
+ mask (torch.Tensor[N,H,W]): mask of target.
55
+ """
56
+ assert pred.size() == gt.size(
57
+ ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}'
58
+
59
+ if not self.supervise_empty:
60
+ empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float()
61
+ loss = ((pred - gt)**2) * empty_mask.expand_as(
62
+ pred) * mask[:, None, :, :].expand_as(pred)
63
+ else:
64
+ loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred)
65
+ loss = loss.mean(dim=3).mean(dim=2).mean(dim=1)
66
+ return loss
67
+
68
+
69
+ class AELoss(nn.Module):
70
+ """Associative Embedding loss.
71
+
72
+ `Associative Embedding: End-to-End Learning for Joint Detection and
73
+ Grouping <https://arxiv.org/abs/1611.05424v2>`_.
74
+ """
75
+
76
+ def __init__(self, loss_type):
77
+ super().__init__()
78
+ self.loss_type = loss_type
79
+
80
+ def singleTagLoss(self, pred_tag, joints):
81
+ """Associative embedding loss for one image.
82
+
83
+ Note:
84
+ - heatmaps width: W
85
+ - heatmaps height: H
86
+ - max_num_people: M
87
+ - num_keypoints: K
88
+
89
+ Args:
90
+ pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image.
91
+ joints (torch.Tensor[M,K,2]): joints information for one image.
92
+ """
93
+ tags = []
94
+ pull = 0
95
+ for joints_per_person in joints:
96
+ tmp = []
97
+ for joint in joints_per_person:
98
+ if joint[1] > 0:
99
+ tmp.append(pred_tag[joint[0]])
100
+ if len(tmp) == 0:
101
+ continue
102
+ tmp = torch.stack(tmp)
103
+ tags.append(torch.mean(tmp, dim=0))
104
+ pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2)
105
+
106
+ num_tags = len(tags)
107
+ if num_tags == 0:
108
+ return (
109
+ _make_input(torch.zeros(1).float(), device=pred_tag.device),
110
+ _make_input(torch.zeros(1).float(), device=pred_tag.device))
111
+ elif num_tags == 1:
112
+ return (_make_input(
113
+ torch.zeros(1).float(), device=pred_tag.device), pull)
114
+
115
+ tags = torch.stack(tags)
116
+
117
+ size = (num_tags, num_tags)
118
+ A = tags.expand(*size)
119
+ B = A.permute(1, 0)
120
+
121
+ diff = A - B
122
+
123
+ if self.loss_type == 'exp':
124
+ diff = torch.pow(diff, 2)
125
+ push = torch.exp(-diff)
126
+ push = torch.sum(push) - num_tags
127
+ elif self.loss_type == 'max':
128
+ diff = 1 - torch.abs(diff)
129
+ push = torch.clamp(diff, min=0).sum() - num_tags
130
+ else:
131
+ raise ValueError('Unknown ae loss type')
132
+
133
+ push_loss = push / ((num_tags - 1) * num_tags) * 0.5
134
+ pull_loss = pull / (num_tags)
135
+
136
+ return push_loss, pull_loss
137
+
138
+ def forward(self, tags, joints):
139
+ """Accumulate the tag loss for each image in the batch.
140
+
141
+ Note:
142
+ - batch_size: N
143
+ - heatmaps width: W
144
+ - heatmaps height: H
145
+ - max_num_people: M
146
+ - num_keypoints: K
147
+
148
+ Args:
149
+ tags (torch.Tensor[N,KxHxW,1]): tag channels of output.
150
+ joints (torch.Tensor[N,M,K,2]): joints information.
151
+ """
152
+ pushes, pulls = [], []
153
+ joints = joints.cpu().data.numpy()
154
+ batch_size = tags.size(0)
155
+ for i in range(batch_size):
156
+ push, pull = self.singleTagLoss(tags[i], joints[i])
157
+ pushes.append(push)
158
+ pulls.append(pull)
159
+ return torch.stack(pushes), torch.stack(pulls)
160
+
161
+
162
+ class MultiLossFactory(nn.Module):
163
+ """Loss for bottom-up models.
164
+
165
+ Args:
166
+ num_joints (int): Number of keypoints.
167
+ num_stages (int): Number of stages.
168
+ ae_loss_type (str): Type of ae loss.
169
+ with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap.
170
+ push_loss_factor (list[float]):
171
+ Parameter of push loss in multi-heatmap.
172
+ pull_loss_factor (list[float]):
173
+ Parameter of pull loss in multi-heatmap.
174
+ with_heatmaps_loss (list[bool]):
175
+ Use heatmap loss or not in multi-heatmap.
176
+ heatmaps_loss_factor (list[float]):
177
+ Parameter of heatmap loss in multi-heatmap.
178
+ supervise_empty (bool): Whether to supervise empty channels.
179
+ """
180
+
181
+ def __init__(self,
182
+ num_joints,
183
+ num_stages,
184
+ ae_loss_type,
185
+ with_ae_loss,
186
+ push_loss_factor,
187
+ pull_loss_factor,
188
+ with_heatmaps_loss,
189
+ heatmaps_loss_factor,
190
+ supervise_empty=True):
191
+ super().__init__()
192
+
193
+ assert isinstance(with_heatmaps_loss, (list, tuple)), \
194
+ 'with_heatmaps_loss should be a list or tuple'
195
+ assert isinstance(heatmaps_loss_factor, (list, tuple)), \
196
+ 'heatmaps_loss_factor should be a list or tuple'
197
+ assert isinstance(with_ae_loss, (list, tuple)), \
198
+ 'with_ae_loss should be a list or tuple'
199
+ assert isinstance(push_loss_factor, (list, tuple)), \
200
+ 'push_loss_factor should be a list or tuple'
201
+ assert isinstance(pull_loss_factor, (list, tuple)), \
202
+ 'pull_loss_factor should be a list or tuple'
203
+
204
+ self.num_joints = num_joints
205
+ self.num_stages = num_stages
206
+ self.ae_loss_type = ae_loss_type
207
+ self.with_ae_loss = with_ae_loss
208
+ self.push_loss_factor = push_loss_factor
209
+ self.pull_loss_factor = pull_loss_factor
210
+ self.with_heatmaps_loss = with_heatmaps_loss
211
+ self.heatmaps_loss_factor = heatmaps_loss_factor
212
+
213
+ self.heatmaps_loss = \
214
+ nn.ModuleList(
215
+ [
216
+ HeatmapLoss(supervise_empty)
217
+ if with_heatmaps_loss else None
218
+ for with_heatmaps_loss in self.with_heatmaps_loss
219
+ ]
220
+ )
221
+
222
+ self.ae_loss = \
223
+ nn.ModuleList(
224
+ [
225
+ AELoss(self.ae_loss_type) if with_ae_loss else None
226
+ for with_ae_loss in self.with_ae_loss
227
+ ]
228
+ )
229
+
230
+ def forward(self, outputs, heatmaps, masks, joints):
231
+ """Forward function to calculate losses.
232
+
233
+ Note:
234
+ - batch_size: N
235
+ - heatmaps width: W
236
+ - heatmaps height: H
237
+ - max_num_people: M
238
+ - num_keypoints: K
239
+ - output_channel: C C=2K if use ae loss else K
240
+
241
+ Args:
242
+ outputs (list(torch.Tensor[N,C,H,W])): outputs of stages.
243
+ heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps.
244
+ masks (list(torch.Tensor[N,H,W])): masks of heatmaps.
245
+ joints (list(torch.Tensor[N,M,K,2])): joints of ae loss.
246
+ """
247
+ heatmaps_losses = []
248
+ push_losses = []
249
+ pull_losses = []
250
+ for idx in range(len(outputs)):
251
+ offset_feat = 0
252
+ if self.heatmaps_loss[idx]:
253
+ heatmaps_pred = outputs[idx][:, :self.num_joints]
254
+ offset_feat = self.num_joints
255
+ heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred,
256
+ heatmaps[idx],
257
+ masks[idx])
258
+ heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx]
259
+ heatmaps_losses.append(heatmaps_loss)
260
+ else:
261
+ heatmaps_losses.append(None)
262
+
263
+ if self.ae_loss[idx]:
264
+ tags_pred = outputs[idx][:, offset_feat:]
265
+ batch_size = tags_pred.size()[0]
266
+ tags_pred = tags_pred.contiguous().view(batch_size, -1, 1)
267
+
268
+ push_loss, pull_loss = self.ae_loss[idx](tags_pred,
269
+ joints[idx])
270
+ push_loss = push_loss * self.push_loss_factor[idx]
271
+ pull_loss = pull_loss * self.pull_loss_factor[idx]
272
+
273
+ push_losses.append(push_loss)
274
+ pull_losses.append(pull_loss)
275
+ else:
276
+ push_losses.append(None)
277
+ pull_losses.append(None)
278
+
279
+ return heatmaps_losses, push_losses, pull_losses
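A rough end-to-end sketch of how MultiLossFactory is wired for a single-stage bottom-up head (hypothetical values; K, the heatmap size, the max number of people, and the loss factors are illustrative, not taken from this commit):

import torch
from easy_ViTPose.vit_models.losses.multi_loss_factory import MultiLossFactory

K = 17
criterion = MultiLossFactory(
    num_joints=K, num_stages=1, ae_loss_type='exp',
    with_ae_loss=[True], push_loss_factor=[0.001], pull_loss_factor=[0.001],
    with_heatmaps_loss=[True], heatmaps_loss_factor=[1.0])

outputs = [torch.rand(2, 2 * K, 64, 64)]               # K heatmap channels followed by K tag channels
heatmaps = [torch.rand(2, K, 64, 64)]                  # target heatmaps
masks = [torch.ones(2, 64, 64)]                        # valid-region masks
joints = [torch.zeros(2, 30, K, 2, dtype=torch.long)]  # (flattened tag index, visibility) per person
hm_losses, push_losses, pull_losses = criterion(outputs, heatmaps, masks, joints)
print(hm_losses[0].mean().item(), push_losses[0].mean().item(), pull_losses[0].mean().item())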
easy_ViTPose/vit_models/losses/regression_loss.py ADDED
@@ -0,0 +1,444 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
+ __all__ = ['SmoothL1Loss', 'WingLoss', 'SoftWingLoss',
10
+ 'L1Loss', 'MPJPELoss', 'MSELoss', 'BoneLoss',
11
+ 'SemiSupervisionLoss']
12
+
13
+
14
+ class SmoothL1Loss(nn.Module):
15
+ """SmoothL1Loss loss.
16
+
17
+ Args:
18
+ use_target_weight (bool): Option to use weighted MSE loss.
19
+ Different joint types may have different target weights.
20
+ loss_weight (float): Weight of the loss. Default: 1.0.
21
+ """
22
+
23
+ def __init__(self, use_target_weight=False, loss_weight=1.):
24
+ super().__init__()
25
+ self.criterion = F.smooth_l1_loss
26
+ self.use_target_weight = use_target_weight
27
+ self.loss_weight = loss_weight
28
+
29
+ def forward(self, output, target, target_weight=None):
30
+ """Forward function.
31
+
32
+ Note:
33
+ - batch_size: N
34
+ - num_keypoints: K
35
+ - dimension of keypoints: D (D=2 or D=3)
36
+
37
+ Args:
38
+ output (torch.Tensor[N, K, D]): Output regression.
39
+ target (torch.Tensor[N, K, D]): Target regression.
40
+ target_weight (torch.Tensor[N, K, D]):
41
+ Weights across different joint types.
42
+ """
43
+ if self.use_target_weight:
44
+ assert target_weight is not None
45
+ loss = self.criterion(output * target_weight,
46
+ target * target_weight)
47
+ else:
48
+ loss = self.criterion(output, target)
49
+
50
+ return loss * self.loss_weight
51
+
52
+
53
+ class WingLoss(nn.Module):
54
+ """Wing Loss. paper ref: 'Wing Loss for Robust Facial Landmark Localisation
55
+ with Convolutional Neural Networks' Feng et al. CVPR'2018.
56
+
57
+ Args:
58
+ omega (float): Also referred to as width.
59
+ epsilon (float): Also referred to as curvature.
60
+ use_target_weight (bool): Option to use weighted MSE loss.
61
+ Different joint types may have different target weights.
62
+ loss_weight (float): Weight of the loss. Default: 1.0.
63
+ """
64
+
65
+ def __init__(self,
66
+ omega=10.0,
67
+ epsilon=2.0,
68
+ use_target_weight=False,
69
+ loss_weight=1.):
70
+ super().__init__()
71
+ self.omega = omega
72
+ self.epsilon = epsilon
73
+ self.use_target_weight = use_target_weight
74
+ self.loss_weight = loss_weight
75
+
76
+ # constant that smoothly links the piecewise-defined linear
77
+ # and nonlinear parts
78
+ self.C = self.omega * (1.0 - math.log(1.0 + self.omega / self.epsilon))
79
+
80
+ def criterion(self, pred, target):
81
+ """Criterion of wingloss.
82
+
83
+ Note:
84
+ - batch_size: N
85
+ - num_keypoints: K
86
+ - dimension of keypoints: D (D=2 or D=3)
87
+
88
+ Args:
89
+ pred (torch.Tensor[N, K, D]): Output regression.
90
+ target (torch.Tensor[N, K, D]): Target regression.
91
+ """
92
+ delta = (target - pred).abs()
93
+ losses = torch.where(
94
+ delta < self.omega,
95
+ self.omega * torch.log(1.0 + delta / self.epsilon), delta - self.C)
96
+ return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0)
97
+
98
+ def forward(self, output, target, target_weight=None):
99
+ """Forward function.
100
+
101
+ Note:
102
+ - batch_size: N
103
+ - num_keypoints: K
104
+ - dimension of keypoints: D (D=2 or D=3)
105
+
106
+ Args:
107
+ output (torch.Tensor[N, K, D]): Output regression.
108
+ target (torch.Tensor[N, K, D]): Target regression.
109
+ target_weight (torch.Tensor[N,K,D]):
110
+ Weights across different joint types.
111
+ """
112
+ if self.use_target_weight:
113
+ assert target_weight is not None
114
+ loss = self.criterion(output * target_weight,
115
+ target * target_weight)
116
+ else:
117
+ loss = self.criterion(output, target)
118
+
119
+ return loss * self.loss_weight
120
+
121
+
122
+
123
+ class SoftWingLoss(nn.Module):
124
+ """Soft Wing Loss 'Structure-Coherent Deep Feature Learning for Robust Face
125
+ Alignment' Lin et al. TIP'2021.
126
+
127
+ loss =
128
+ 1. |x| , if |x| < omega1
129
+ 2. omega2*ln(1+|x|/epsilon) + B, if |x| >= omega1
130
+
131
+ Args:
132
+ omega1 (float): The first threshold.
133
+ omega2 (float): The second threshold.
134
+ epsilon (float): Also referred to as curvature.
135
+ use_target_weight (bool): Option to use weighted MSE loss.
136
+ Different joint types may have different target weights.
137
+ loss_weight (float): Weight of the loss. Default: 1.0.
138
+ """
139
+
140
+ def __init__(self,
141
+ omega1=2.0,
142
+ omega2=20.0,
143
+ epsilon=0.5,
144
+ use_target_weight=False,
145
+ loss_weight=1.):
146
+ super().__init__()
147
+ self.omega1 = omega1
148
+ self.omega2 = omega2
149
+ self.epsilon = epsilon
150
+ self.use_target_weight = use_target_weight
151
+ self.loss_weight = loss_weight
152
+
153
+ # constant that smoothly links the piecewise-defined linear
154
+ # and nonlinear parts
155
+ self.B = self.omega1 - self.omega2 * math.log(1.0 + self.omega1 /
156
+ self.epsilon)
157
+
158
+ def criterion(self, pred, target):
159
+ """Criterion of wingloss.
160
+
161
+ Note:
162
+ batch_size: N
163
+ num_keypoints: K
164
+ dimension of keypoints: D (D=2 or D=3)
165
+
166
+ Args:
167
+ pred (torch.Tensor[N, K, D]): Output regression.
168
+ target (torch.Tensor[N, K, D]): Target regression.
169
+ """
170
+ delta = (target - pred).abs()
171
+ losses = torch.where(
172
+ delta < self.omega1, delta,
173
+ self.omega2 * torch.log(1.0 + delta / self.epsilon) + self.B)
174
+ return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0)
175
+
176
+ def forward(self, output, target, target_weight=None):
177
+ """Forward function.
178
+
179
+ Note:
180
+ batch_size: N
181
+ num_keypoints: K
182
+ dimension of keypoints: D (D=2 or D=3)
183
+
184
+ Args:
185
+ output (torch.Tensor[N, K, D]): Output regression.
186
+ target (torch.Tensor[N, K, D]): Target regression.
187
+ target_weight (torch.Tensor[N, K, D]):
188
+ Weights across different joint types.
189
+ """
190
+ if self.use_target_weight:
191
+ assert target_weight is not None
192
+ loss = self.criterion(output * target_weight,
193
+ target * target_weight)
194
+ else:
195
+ loss = self.criterion(output, target)
196
+
197
+ return loss * self.loss_weight
198
+
199
+
200
+ class MPJPELoss(nn.Module):
201
+ """MPJPE (Mean Per Joint Position Error) loss.
202
+
203
+ Args:
204
+ use_target_weight (bool): Option to use weighted MSE loss.
205
+ Different joint types may have different target weights.
206
+ loss_weight (float): Weight of the loss. Default: 1.0.
207
+ """
208
+
209
+ def __init__(self, use_target_weight=False, loss_weight=1.):
210
+ super().__init__()
211
+ self.use_target_weight = use_target_weight
212
+ self.loss_weight = loss_weight
213
+
214
+ def forward(self, output, target, target_weight=None):
215
+ """Forward function.
216
+
217
+ Note:
218
+ - batch_size: N
219
+ - num_keypoints: K
220
+ - dimension of keypoints: D (D=2 or D=3)
221
+
222
+ Args:
223
+ output (torch.Tensor[N, K, D]): Output regression.
224
+ target (torch.Tensor[N, K, D]): Target regression.
225
+ target_weight (torch.Tensor[N,K,D]):
226
+ Weights across different joint types.
227
+ """
228
+
229
+ if self.use_target_weight:
230
+ assert target_weight is not None
231
+ loss = torch.mean(
232
+ torch.norm((output - target) * target_weight, dim=-1))
233
+ else:
234
+ loss = torch.mean(torch.norm(output - target, dim=-1))
235
+
236
+ return loss * self.loss_weight
237
+
238
+
239
+ class L1Loss(nn.Module):
240
+ """L1Loss loss ."""
241
+
242
+ def __init__(self, use_target_weight=False, loss_weight=1.):
243
+ super().__init__()
244
+ self.criterion = F.l1_loss
245
+ self.use_target_weight = use_target_weight
246
+ self.loss_weight = loss_weight
247
+
248
+ def forward(self, output, target, target_weight=None):
249
+ """Forward function.
250
+
251
+ Note:
252
+ - batch_size: N
253
+ - num_keypoints: K
254
+
255
+ Args:
256
+ output (torch.Tensor[N, K, 2]): Output regression.
257
+ target (torch.Tensor[N, K, 2]): Target regression.
258
+ target_weight (torch.Tensor[N, K, 2]):
259
+ Weights across different joint types.
260
+ """
261
+ if self.use_target_weight:
262
+ assert target_weight is not None
263
+ loss = self.criterion(output * target_weight,
264
+ target * target_weight)
265
+ else:
266
+ loss = self.criterion(output, target)
267
+
268
+ return loss * self.loss_weight
269
+
270
+
271
+ class MSELoss(nn.Module):
272
+ """MSE loss for coordinate regression."""
273
+
274
+ def __init__(self, use_target_weight=False, loss_weight=1.):
275
+ super().__init__()
276
+ self.criterion = F.mse_loss
277
+ self.use_target_weight = use_target_weight
278
+ self.loss_weight = loss_weight
279
+
280
+ def forward(self, output, target, target_weight=None):
281
+ """Forward function.
282
+
283
+ Note:
284
+ - batch_size: N
285
+ - num_keypoints: K
286
+
287
+ Args:
288
+ output (torch.Tensor[N, K, 2]): Output regression.
289
+ target (torch.Tensor[N, K, 2]): Target regression.
290
+ target_weight (torch.Tensor[N, K, 2]):
291
+ Weights across different joint types.
292
+ """
293
+ if self.use_target_weight:
294
+ assert target_weight is not None
295
+ loss = self.criterion(output * target_weight,
296
+ target * target_weight)
297
+ else:
298
+ loss = self.criterion(output, target)
299
+
300
+ return loss * self.loss_weight
301
+
302
+
303
+ class BoneLoss(nn.Module):
304
+ """Bone length loss.
305
+
306
+ Args:
307
+ joint_parents (list): Indices of each joint's parent joint.
308
+ use_target_weight (bool): Option to use weighted bone loss.
309
+ Different bone types may have different target weights.
310
+ loss_weight (float): Weight of the loss. Default: 1.0.
311
+ """
312
+
313
+ def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.):
314
+ super().__init__()
315
+ self.joint_parents = joint_parents
316
+ self.use_target_weight = use_target_weight
317
+ self.loss_weight = loss_weight
318
+
319
+ self.non_root_indices = []
320
+ for i in range(len(self.joint_parents)):
321
+ if i != self.joint_parents[i]:
322
+ self.non_root_indices.append(i)
323
+
324
+ def forward(self, output, target, target_weight=None):
325
+ """Forward function.
326
+
327
+ Note:
328
+ - batch_size: N
329
+ - num_keypoints: K
330
+ - dimension of keypoints: D (D=2 or D=3)
331
+
332
+ Args:
333
+ output (torch.Tensor[N, K, D]): Output regression.
334
+ target (torch.Tensor[N, K, D]): Target regression.
335
+ target_weight (torch.Tensor[N, K-1]):
336
+ Weights across different bone types.
337
+ """
338
+ output_bone = torch.norm(
339
+ output - output[:, self.joint_parents, :],
340
+ dim=-1)[:, self.non_root_indices]
341
+ target_bone = torch.norm(
342
+ target - target[:, self.joint_parents, :],
343
+ dim=-1)[:, self.non_root_indices]
344
+ if self.use_target_weight:
345
+ assert target_weight is not None
346
+ loss = torch.mean(
347
+ torch.abs((output_bone * target_weight).mean(dim=0) -
348
+ (target_bone * target_weight).mean(dim=0)))
349
+ else:
350
+ loss = torch.mean(
351
+ torch.abs(output_bone.mean(dim=0) - target_bone.mean(dim=0)))
352
+
353
+ return loss * self.loss_weight
354
+
355
+
356
+ class SemiSupervisionLoss(nn.Module):
357
+ """Semi-supervision loss for unlabeled data. It is composed of projection
358
+ loss and bone loss.
359
+
360
+ Paper ref: `3D human pose estimation in video with temporal convolutions
361
+ and semi-supervised training` Dario Pavllo et al. CVPR'2019.
362
+
363
+ Args:
364
+ joint_parents (list): Indices of each joint's parent joint.
365
+ projection_loss_weight (float): Weight for projection loss.
366
+ bone_loss_weight (float): Weight for bone loss.
367
+ warmup_iterations (int): Number of warmup iterations. In the first
368
+ `warmup_iterations` iterations, the model is trained only on
369
+ labeled data, and semi-supervision loss will be 0.
370
+ This is a workaround since currently we cannot access
371
+ epoch number in loss functions. Note that the iteration number in
372
+ an epoch can be changed due to different GPU numbers in multi-GPU
373
+ settings. So please set this parameter carefully.
374
+ warmup_iterations = dataset_size // samples_per_gpu // gpu_num
375
+ * warmup_epochs
376
+ """
377
+
378
+ def __init__(self,
379
+ joint_parents,
380
+ projection_loss_weight=1.,
381
+ bone_loss_weight=1.,
382
+ warmup_iterations=0):
383
+ super().__init__()
384
+ self.criterion_projection = MPJPELoss(
385
+ loss_weight=projection_loss_weight)
386
+ self.criterion_bone = BoneLoss(
387
+ joint_parents, loss_weight=bone_loss_weight)
388
+ self.warmup_iterations = warmup_iterations
389
+ self.num_iterations = 0
390
+
391
+ @staticmethod
392
+ def project_joints(x, intrinsics):
393
+ """Project 3D joint coordinates to 2D image plane using camera
394
+ intrinsic parameters.
395
+
396
+ Args:
397
+ x (torch.Tensor[N, K, 3]): 3D joint coordinates.
398
+ intrinsics (torch.Tensor[N, 4] | torch.Tensor[N, 9]): Camera
399
+ intrinsics: f (2), c (2), k (3), p (2).
400
+ """
401
+ while intrinsics.dim() < x.dim():
402
+ intrinsics.unsqueeze_(1)
403
+ f = intrinsics[..., :2]
404
+ c = intrinsics[..., 2:4]
405
+ _x = torch.clamp(x[:, :, :2] / x[:, :, 2:], -1, 1)
406
+ if intrinsics.shape[-1] == 9:
407
+ k = intrinsics[..., 4:7]
408
+ p = intrinsics[..., 7:9]
409
+
410
+ r2 = torch.sum(_x[:, :, :2]**2, dim=-1, keepdim=True)
411
+ radial = 1 + torch.sum(
412
+ k * torch.cat((r2, r2**2, r2**3), dim=-1),
413
+ dim=-1,
414
+ keepdim=True)
415
+ tan = torch.sum(p * _x, dim=-1, keepdim=True)
416
+ _x = _x * (radial + tan) + p * r2
417
+ _x = f * _x + c
418
+ return _x
419
+
420
+ def forward(self, output, target):
421
+ losses = dict()
422
+
423
+ self.num_iterations += 1
424
+ if self.num_iterations <= self.warmup_iterations:
425
+ return losses
426
+
427
+ labeled_pose = output['labeled_pose']
428
+ unlabeled_pose = output['unlabeled_pose']
429
+ unlabeled_traj = output['unlabeled_traj']
430
+ unlabeled_target_2d = target['unlabeled_target_2d']
431
+ intrinsics = target['intrinsics']
432
+
433
+ # projection loss
434
+ unlabeled_output = unlabeled_pose + unlabeled_traj
435
+ unlabeled_output_2d = self.project_joints(unlabeled_output, intrinsics)
436
+ loss_proj = self.criterion_projection(unlabeled_output_2d,
437
+ unlabeled_target_2d, None)
438
+ losses['proj_loss'] = loss_proj
439
+
440
+ # bone loss
441
+ loss_bone = self.criterion_bone(unlabeled_pose, labeled_pose, None)
442
+ losses['bone_loss'] = loss_bone
443
+
444
+ return losses
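These regression losses all take (output, target, target_weight) with keypoints of shape [N, K, D]; a compact sketch (illustrative shapes only, repo root assumed importable):

import torch
from easy_ViTPose.vit_models.losses.regression_loss import (SmoothL1Loss, WingLoss,
                                                            SoftWingLoss, MPJPELoss)

pred = torch.rand(4, 17, 2)      # N=4 samples, K=17 keypoints, D=2 coordinates
gt = torch.rand(4, 17, 2)
weight = torch.ones(4, 17, 2)    # per-keypoint weights

for criterion in (SmoothL1Loss(use_target_weight=True),
                  WingLoss(use_target_weight=True),
                  SoftWingLoss(use_target_weight=True),
                  MPJPELoss(use_target_weight=True)):
    print(type(criterion).__name__, criterion(pred, gt, weight).item())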
easy_ViTPose/vit_models/model.py ADDED
@@ -0,0 +1,24 @@
1
+ import torch.nn as nn
2
+
3
+ from .backbone.vit import ViT
4
+ from .head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead
5
+
6
+
7
+ __all__ = ['ViTPose']
8
+
9
+
10
+ class ViTPose(nn.Module):
11
+ def __init__(self, cfg: dict) -> None:
12
+ super(ViTPose, self).__init__()
13
+
14
+ backbone_cfg = {k: v for k, v in cfg['backbone'].items() if k != 'type'}
15
+ head_cfg = {k: v for k, v in cfg['keypoint_head'].items() if k != 'type'}
16
+
17
+ self.backbone = ViT(**backbone_cfg)
18
+ self.keypoint_head = TopdownHeatmapSimpleHead(**head_cfg)
19
+
20
+ def forward_features(self, x):
21
+ return self.backbone(x)
22
+
23
+ def forward(self, x):
24
+ return self.keypoint_head(self.backbone(x))
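The wrapper above only needs a cfg dict with 'backbone' and 'keypoint_head' entries; the authoritative dicts ship in the files under easy_ViTPose/configs/. Below is a hypothetical ViTPose-B-style configuration, shown purely to illustrate the expected structure (the keyword values are assumptions, not copied from this commit):

import torch
from easy_ViTPose.vit_models.model import ViTPose

# Hypothetical ViTPose-B-style cfg; the real values live in easy_ViTPose/configs/.
cfg = dict(
    backbone=dict(type='ViT', img_size=(256, 192), patch_size=16,
                  embed_dim=768, depth=12, num_heads=12, ratio=1,
                  mlp_ratio=4, qkv_bias=True, drop_path_rate=0.3),
    keypoint_head=dict(type='TopdownHeatmapSimpleHead', in_channels=768,
                       out_channels=17, num_deconv_layers=2,
                       num_deconv_filters=(256, 256), num_deconv_kernels=(4, 4),
                       extra=dict(final_conv_kernel=1)))

model = ViTPose(cfg).eval()
with torch.no_grad():
    heatmaps = model(torch.randn(1, 3, 256, 192))   # expected: (1, 17, 64, 48) keypoint heatmaps
print(heatmaps.shape)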
easy_ViTPose/vit_models/optimizer.py ADDED
@@ -0,0 +1,15 @@
1
+ import torch.optim as optim
2
+
3
+ class LayerDecayOptimizer:
4
+ def __init__(self, optimizer, layerwise_decay_rate):
5
+ self.optimizer = optimizer
6
+ self.layerwise_decay_rate = layerwise_decay_rate
7
+ self.param_groups = optimizer.param_groups
8
+
9
+ def step(self, *args, **kwargs):
10
+ for i, group in enumerate(self.optimizer.param_groups):
11
+ group['lr'] *= self.layerwise_decay_rate[i]
12
+ self.optimizer.step(*args, **kwargs)
13
+
14
+ def zero_grad(self, *args, **kwargs):
15
+ self.optimizer.zero_grad(*args, **kwargs)
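LayerDecayOptimizer simply scales each parameter group's learning rate by its own decay factor before delegating to the wrapped optimizer; a small sketch (the group split and rates are illustrative):

import torch
import torch.optim as optim
from easy_ViTPose.vit_models.optimizer import LayerDecayOptimizer

layer1, layer2 = torch.nn.Linear(8, 8), torch.nn.Linear(8, 2)
base = optim.AdamW([{'params': layer1.parameters()},    # earlier layer: stronger decay
                    {'params': layer2.parameters()}],   # later layer: no decay
                   lr=1e-3)
opt = LayerDecayOptimizer(base, layerwise_decay_rate=[0.75, 1.0])

x, y = torch.randn(4, 8), torch.randn(4, 2)
loss = ((layer2(layer1(x)) - y) ** 2).mean()
opt.zero_grad()
loss.backward()
opt.step()    # each group's lr is multiplied by its rate, then AdamW steps
print([g['lr'] for g in base.param_groups])

Note that, as written above, the scaling is applied on every step() call, so the per-group rates compound over the course of training.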
easy_ViTPose/vit_utils/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .util import *
2
+ from .top_down_eval import *
3
+ from .post_processing import *
4
+ from .visualization import *
5
+ from .dist_util import *
6
+ from .logging import *
easy_ViTPose/vit_utils/dist_util.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ import functools
4
+ import os
5
+ import socket
6
+ import subprocess
7
+ from collections import OrderedDict
8
+ from typing import Callable, List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.multiprocessing as mp
12
+ from torch import distributed as dist
13
+ from torch._utils import (_flatten_dense_tensors, _take_tensors,
14
+ _unflatten_dense_tensors)
15
+
16
+
17
+ def is_mps_available() -> bool:
18
+ """Return True if mps devices exist.
19
+
20
+ It is specialized for Mac M1 chips and requires torch version 1.12 or higher.
21
+ """
22
+ try:
23
+ import torch
24
+ return hasattr(torch.backends,
25
+ 'mps') and torch.backends.mps.is_available()
26
+ except Exception:
27
+ return False
28
+
29
+ def _find_free_port() -> str:
30
+ # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
31
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
32
+ # Binding to port 0 will cause the OS to find an available port for us
33
+ sock.bind(('', 0))
34
+ port = sock.getsockname()[1]
35
+ sock.close()
36
+ # NOTE: there is still a chance the port could be taken by other processes.
37
+ return port
38
+
39
+
40
+ def _is_free_port(port: int) -> bool:
41
+ ips = socket.gethostbyname_ex(socket.gethostname())[-1]
42
+ ips.append('localhost')
43
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
44
+ return all(s.connect_ex((ip, port)) != 0 for ip in ips)
45
+
46
+
47
+ def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None:
48
+ if mp.get_start_method(allow_none=True) is None:
49
+ mp.set_start_method('spawn')
50
+ if launcher == 'pytorch':
51
+ _init_dist_pytorch(backend, **kwargs)
52
+ elif launcher == 'mpi':
53
+ _init_dist_mpi(backend, **kwargs)
54
+ elif launcher == 'slurm':
55
+ _init_dist_slurm(backend, **kwargs)
56
+ else:
57
+ raise ValueError(f'Invalid launcher type: {launcher}')
58
+
59
+
60
+ def _init_dist_pytorch(backend: str, **kwargs) -> None:
61
+ # TODO: use local_rank instead of rank % num_gpus
62
+ rank = int(os.environ['RANK'])
63
+ num_gpus = torch.cuda.device_count()
64
+ torch.cuda.set_device(rank % num_gpus)
65
+ dist.init_process_group(backend=backend, **kwargs)
66
+
67
+
68
+ def _init_dist_mpi(backend: str, **kwargs) -> None:
69
+ local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
70
+ torch.cuda.set_device(local_rank)
71
+ if 'MASTER_PORT' not in os.environ:
72
+ # 29500 is torch.distributed default port
73
+ os.environ['MASTER_PORT'] = '29500'
74
+ if 'MASTER_ADDR' not in os.environ:
75
+ raise KeyError('The environment variable MASTER_ADDR is not set')
76
+ os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
77
+ os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
78
+ dist.init_process_group(backend=backend, **kwargs)
79
+
80
+
81
+ def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
82
+ """Initialize slurm distributed training environment.
83
+
84
+ If argument ``port`` is not specified, then the master port will be system
85
+ environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
86
+ environment variable, then a default port ``29500`` will be used.
87
+
88
+ Args:
89
+ backend (str): Backend of torch.distributed.
90
+ port (int, optional): Master port. Defaults to None.
91
+ """
92
+ proc_id = int(os.environ['SLURM_PROCID'])
93
+ ntasks = int(os.environ['SLURM_NTASKS'])
94
+ node_list = os.environ['SLURM_NODELIST']
95
+ num_gpus = torch.cuda.device_count()
96
+ torch.cuda.set_device(proc_id % num_gpus)
97
+ addr = subprocess.getoutput(
98
+ f'scontrol show hostname {node_list} | head -n1')
99
+ # specify master port
100
+ if port is not None:
101
+ os.environ['MASTER_PORT'] = str(port)
102
+ elif 'MASTER_PORT' in os.environ:
103
+ pass # use MASTER_PORT in the environment variable
104
+ else:
105
+ # if torch.distributed default port(29500) is available
106
+ # then use it, else find a free port
107
+ if _is_free_port(29500):
108
+ os.environ['MASTER_PORT'] = '29500'
109
+ else:
110
+ os.environ['MASTER_PORT'] = str(_find_free_port())
111
+ # use MASTER_ADDR in the environment variable if it already exists
112
+ if 'MASTER_ADDR' not in os.environ:
113
+ os.environ['MASTER_ADDR'] = addr
114
+ os.environ['WORLD_SIZE'] = str(ntasks)
115
+ os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
116
+ os.environ['RANK'] = str(proc_id)
117
+ dist.init_process_group(backend=backend)
118
+
119
+
120
+ def get_dist_info() -> Tuple[int, int]:
121
+ if dist.is_available() and dist.is_initialized():
122
+ rank = dist.get_rank()
123
+ world_size = dist.get_world_size()
124
+ else:
125
+ rank = 0
126
+ world_size = 1
127
+ return rank, world_size
128
+
129
+
130
+ def master_only(func: Callable) -> Callable:
131
+
132
+ @functools.wraps(func)
133
+ def wrapper(*args, **kwargs):
134
+ rank, _ = get_dist_info()
135
+ if rank == 0:
136
+ return func(*args, **kwargs)
137
+
138
+ return wrapper
139
+
140
+
141
+ def allreduce_params(params: List[torch.nn.Parameter],
142
+ coalesce: bool = True,
143
+ bucket_size_mb: int = -1) -> None:
144
+ """Allreduce parameters.
145
+
146
+ Args:
147
+ params (list[torch.nn.Parameter]): List of parameters or buffers
148
+ of a model.
149
+ coalesce (bool, optional): Whether allreduce parameters as a whole.
150
+ Defaults to True.
151
+ bucket_size_mb (int, optional): Size of bucket, the unit is MB.
152
+ Defaults to -1.
153
+ """
154
+ _, world_size = get_dist_info()
155
+ if world_size == 1:
156
+ return
157
+ params = [param.data for param in params]
158
+ if coalesce:
159
+ _allreduce_coalesced(params, world_size, bucket_size_mb)
160
+ else:
161
+ for tensor in params:
162
+ dist.all_reduce(tensor.div_(world_size))
163
+
164
+
165
+ def allreduce_grads(params: List[torch.nn.Parameter],
166
+ coalesce: bool = True,
167
+ bucket_size_mb: int = -1) -> None:
168
+ """Allreduce gradients.
169
+
170
+ Args:
171
+ params (list[torch.nn.Parameter]): List of parameters of a model.
172
+ coalesce (bool, optional): Whether allreduce parameters as a whole.
173
+ Defaults to True.
174
+ bucket_size_mb (int, optional): Size of bucket, the unit is MB.
175
+ Defaults to -1.
176
+ """
177
+ grads = [
178
+ param.grad.data for param in params
179
+ if param.requires_grad and param.grad is not None
180
+ ]
181
+ _, world_size = get_dist_info()
182
+ if world_size == 1:
183
+ return
184
+ if coalesce:
185
+ _allreduce_coalesced(grads, world_size, bucket_size_mb)
186
+ else:
187
+ for tensor in grads:
188
+ dist.all_reduce(tensor.div_(world_size))
189
+
190
+
191
+ def _allreduce_coalesced(tensors: torch.Tensor,
192
+ world_size: int,
193
+ bucket_size_mb: int = -1) -> None:
194
+ if bucket_size_mb > 0:
195
+ bucket_size_bytes = bucket_size_mb * 1024 * 1024
196
+ buckets = _take_tensors(tensors, bucket_size_bytes)
197
+ else:
198
+ buckets = OrderedDict()
199
+ for tensor in tensors:
200
+ tp = tensor.type()
201
+ if tp not in buckets:
202
+ buckets[tp] = []
203
+ buckets[tp].append(tensor)
204
+ buckets = buckets.values()
205
+
206
+ for bucket in buckets:
207
+ flat_tensors = _flatten_dense_tensors(bucket)
208
+ dist.all_reduce(flat_tensors)
209
+ flat_tensors.div_(world_size)
210
+ for tensor, synced in zip(
211
+ bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
212
+ tensor.copy_(synced)
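Outside of a torch.distributed run these helpers degrade gracefully; a quick sketch of the two most commonly used ones (illustrative, assuming the package and its dependencies are importable):

from easy_ViTPose.vit_utils.dist_util import get_dist_info, master_only

rank, world_size = get_dist_info()    # falls back to (0, 1) when no process group is initialized
print(f'rank {rank} / world size {world_size}')

@master_only
def log_once(msg):
    # Runs only on rank 0; on other ranks the wrapper returns None.
    print(msg)

log_once('saving checkpoint ...')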
easy_ViTPose/vit_utils/inference.py ADDED
@@ -0,0 +1,93 @@
1
+ import cv2
2
+ import numpy as np
3
+ import json
4
+
5
+
6
+ rotation_map = {
7
+ 0: None,
8
+ 90: cv2.ROTATE_90_COUNTERCLOCKWISE,
9
+ 180: cv2.ROTATE_180,
10
+ 270: cv2.ROTATE_90_CLOCKWISE
11
+ }
12
+
13
+ class NumpyEncoder(json.JSONEncoder):
14
+ def default(self, obj):
15
+ if isinstance(obj, np.ndarray):
16
+ return obj.tolist()
17
+ return json.JSONEncoder.default(self, obj)
18
+
19
+ def draw_bboxes(image, bounding_boxes, boxes_id, scores):
20
+ image_with_boxes = image.copy()
21
+
22
+ for bbox, bbox_id, score in zip(bounding_boxes, boxes_id, scores):
23
+ x1, y1, x2, y2 = bbox
24
+ cv2.rectangle(image_with_boxes, (x1, y1), (x2, y2), (128, 128, 0), 2)
25
+
26
+ label = f'#{bbox_id}: {score:.2f}'
27
+
28
+ (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
29
+ label_x = x1
30
+ label_y = y1 - 5 if y1 > 20 else y1 + 20
31
+
32
+ # Draw a filled rectangle as the background for the label
33
+ cv2.rectangle(image_with_boxes, (x1, label_y - label_height - 5),
34
+ (x1 + label_width, label_y + 5), (128, 128, 0), cv2.FILLED)
35
+ cv2.putText(image_with_boxes, label, (label_x, label_y),
36
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
37
+
38
+ return image_with_boxes
39
+
40
+
41
+ def pad_image(image: np.ndarray, aspect_ratio: float) -> tuple[np.ndarray, tuple[int, int]]:
42
+ # Get the current aspect ratio of the image
43
+ image_height, image_width = image.shape[:2]
44
+ current_aspect_ratio = image_width / image_height
45
+
46
+ left_pad = 0
47
+ top_pad = 0
48
+ # Determine whether to pad horizontally or vertically
49
+ if current_aspect_ratio < aspect_ratio:
50
+ # Pad horizontally
51
+ target_width = int(aspect_ratio * image_height)
52
+ pad_width = target_width - image_width
53
+ left_pad = pad_width // 2
54
+ right_pad = pad_width - left_pad
55
+
56
+ padded_image = np.pad(image,
57
+ pad_width=((0, 0), (left_pad, right_pad), (0, 0)),
58
+ mode='constant')
59
+ else:
60
+ # Pad vertically
61
+ target_height = int(image_width / aspect_ratio)
62
+ pad_height = target_height - image_height
63
+ top_pad = pad_height // 2
64
+ bottom_pad = pad_height - top_pad
65
+
66
+ padded_image = np.pad(image,
67
+ pad_width=((top_pad, bottom_pad), (0, 0), (0, 0)),
68
+ mode='constant')
69
+ return padded_image, (left_pad, top_pad)
70
+
71
+
72
+ class VideoReader(object):
73
+ def __init__(self, file_name, rotate=0):
74
+ self.file_name = file_name
75
+ self.rotate = rotation_map[rotate]
76
+ try: # OpenCV needs int to read from webcam
77
+ self.file_name = int(file_name)
78
+ except ValueError:
79
+ pass
80
+
81
+ def __iter__(self):
82
+ self.cap = cv2.VideoCapture(self.file_name)
83
+ if not self.cap.isOpened():
84
+ raise IOError('Video {} cannot be opened'.format(self.file_name))
85
+ return self
86
+
87
+ def __next__(self):
88
+ was_read, img = self.cap.read()
89
+ if not was_read:
90
+ raise StopIteration
91
+ if self.rotate is not None:
92
+ img = cv2.rotate(img, self.rotate)
93
+ return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
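
A minimal usage sketch of the inference helpers (not part of this commit): reading frames from a video, padding them to a fixed aspect ratio and drawing a dummy detection. The file name 'input.mp4' and the example box, id and score are illustrative assumptions.

from easy_ViTPose.vit_utils.inference import VideoReader, pad_image, draw_bboxes

for frame in VideoReader('input.mp4', rotate=0):        # frames come back as RGB
    padded, (left_pad, top_pad) = pad_image(frame, aspect_ratio=4 / 3)
    bboxes = [(50, 60, 200, 300)]                        # dummy xyxy box in pixels
    annotated = draw_bboxes(padded, bboxes, boxes_id=[1], scores=[0.90])
    break                                                # process a single frame
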
easy_ViTPose/vit_utils/logging.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import logging
3
+
4
+ import torch.distributed as dist
5
+
6
+ logger_initialized: dict = {}
7
+
8
+
9
+ def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
10
+ """Initialize and get a logger by name.
11
+
12
+ If the logger has not been initialized, this method will initialize the
13
+ logger by adding one or two handlers, otherwise the initialized logger will
14
+ be directly returned. During initialization, a StreamHandler will always be
15
+ added. If `log_file` is specified and the process rank is 0, a FileHandler
16
+ will also be added.
17
+
18
+ Args:
19
+ name (str): Logger name.
20
+ log_file (str | None): The log filename. If specified, a FileHandler
21
+ will be added to the logger.
22
+ log_level (int): The logger level. Note that only the process of
23
+ rank 0 is affected, and other processes will set the level to
24
+ "Error" thus be silent most of the time.
25
+ file_mode (str): The file mode used in opening log file.
26
+ Defaults to 'w'.
27
+
28
+ Returns:
29
+ logging.Logger: The expected logger.
30
+ """
31
+ logger = logging.getLogger(name)
32
+ if name in logger_initialized:
33
+ return logger
34
+ # handle hierarchical names
35
+ # e.g., logger "a" is initialized, then logger "a.b" will skip the
36
+ # initialization since it is a child of "a".
37
+ for logger_name in logger_initialized:
38
+ if name.startswith(logger_name):
39
+ return logger
40
+
41
+ # handle duplicate logs to the console
42
+ # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
43
+ # to the root logger. As logger.propagate is True by default, this root
44
+ # level handler causes logging messages from rank>0 processes to
45
+ # unexpectedly show up on the console, creating much unwanted clutter.
46
+ # To fix this issue, we set the root logger's StreamHandler, if any, to log
47
+ # at the ERROR level.
48
+ for handler in logger.root.handlers:
49
+ if type(handler) is logging.StreamHandler:
50
+ handler.setLevel(logging.ERROR)
51
+
52
+ stream_handler = logging.StreamHandler()
53
+ handlers = [stream_handler]
54
+
55
+ if dist.is_available() and dist.is_initialized():
56
+ rank = dist.get_rank()
57
+ else:
58
+ rank = 0
59
+
60
+ # only rank 0 will add a FileHandler
61
+ if rank == 0 and log_file is not None:
62
+ # The default mode of the built-in FileHandler is 'a' (append); we
63
+ # expose file_mode so callers can switch between appending and the
64
+ # 'w' default used here.
65
+ file_handler = logging.FileHandler(log_file, file_mode)
66
+ handlers.append(file_handler)
67
+
68
+ formatter = logging.Formatter(
69
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
70
+ for handler in handlers:
71
+ handler.setFormatter(formatter)
72
+ handler.setLevel(log_level)
73
+ logger.addHandler(handler)
74
+
75
+ if rank == 0:
76
+ logger.setLevel(log_level)
77
+ else:
78
+ logger.setLevel(logging.ERROR)
79
+
80
+ logger_initialized[name] = True
81
+
82
+ return logger
83
+
84
+
85
+ def print_log(msg, logger=None, level=logging.INFO):
86
+ """Print a log message.
87
+
88
+ Args:
89
+ msg (str): The message to be logged.
90
+ logger (logging.Logger | str | None): The logger to be used.
91
+ Some special loggers are:
92
+
93
+ - "silent": no message will be printed.
94
+ - other str: the logger obtained with `get_root_logger(logger)`.
95
+ - None: The `print()` method will be used to print log messages.
96
+ level (int): Logging level. Only available when `logger` is a Logger
97
+ object or "root".
98
+ """
99
+ if logger is None:
100
+ print(msg)
101
+ elif isinstance(logger, logging.Logger):
102
+ logger.log(level, msg)
103
+ elif logger == 'silent':
104
+ pass
105
+ elif isinstance(logger, str):
106
+ _logger = get_logger(logger)
107
+ _logger.log(level, msg)
108
+ else:
109
+ raise TypeError(
110
+ 'logger should be either a logging.Logger object, str, '
111
+ f'"silent" or None, but got {type(logger)}')
112
+
113
+
114
+ def get_root_logger(log_file=None, log_level=logging.INFO):
115
+ """Use `get_logger` method in mmcv to get the root logger.
116
+
117
+ The logger will be initialized if it has not been initialized. By default a
118
+ StreamHandler will be added. If `log_file` is specified, a FileHandler will
119
+ also be added. The name of the root logger is the top-level package name,
120
+ e.g., "mmpose".
121
+
122
+ Args:
123
+ log_file (str | None): The log filename. If specified, a FileHandler
124
+ will be added to the root logger.
125
+ log_level (int): The root logger level. Note that only the process of
126
+ rank 0 is affected, while other processes will set the level to
127
+ "Error" and be silent most of the time.
128
+
129
+ Returns:
130
+ logging.Logger: The root logger.
131
+ """
132
+ return get_logger(__name__.split('.')[0], log_file, log_level)
133
+
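
A minimal usage sketch of the logging helpers (not part of this commit): building the package-level root logger and routing messages through print_log. The log file name 'train.log' is an illustrative assumption.

import logging
from easy_ViTPose.vit_utils.logging import get_root_logger, print_log

logger = get_root_logger(log_file='train.log', log_level=logging.INFO)
print_log('starting training', logger=logger)    # goes through the Logger object
print_log('console only message', logger=None)   # falls back to plain print()
print_log('suppressed message', logger='silent') # dropped entirely
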
easy_ViTPose/vit_utils/nms/__init__.py ADDED
File without changes
easy_ViTPose/vit_utils/nms/cpu_nms.c ADDED
The diff for this file is too large to render. See raw diff
 
easy_ViTPose/vit_utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so ADDED
Binary file (264 kB). View file